存档

2011年2月 的存档

浅析linux kernel network之socket创建

二月 20th, 2011 6 条评论 12,635 人阅读过

去年受@colyli指点,决定花些时间读一些linux kernel network部分的代码,准备把阅读代码的过程记录下来,也希望能有大牛前来指点,下面就先写一下创建socket对象的过程:

首先,创建socket需要执行socket系统调用:

int socket(int domain, int type, int protocol);

该系统调用有3个参数,在内核中由SYSCALL_DEFINE3定义,具体代码如下:

/*
 * socket(2) system call: create a socket and hand back a file descriptor.
 *
 * The upper bits of @type may carry SOCK_CLOEXEC and/or SOCK_NONBLOCK; any
 * other extra bit is rejected with -EINVAL.  Returns the new descriptor on
 * success or a negative errno from sock_create()/sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int retval;
	int flags = type & ~SOCK_TYPE_MASK;

	/* Only the two creation flags may accompany the socket type. */
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
		flags &= ~SOCK_NONBLOCK;
		flags |= O_NONBLOCK;
	}

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	/* Bind the socket to a descriptor; drop the socket if that fails. */
	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}

上述代码中首先做了一个类型检查,type即我们所熟知的SOCK_STREAM、SOCK_DGRAM等sock_type枚举,其取值范围为1-10,均位于type字段的低4位(SOCK_TYPE_MASK宏的值为0xF),而SOCK_CLOEXEC和SOCK_NONBLOCK位于高位。该检查的作用是检查除了基本的socket类型和SOCK_CLOEXEC、SOCK_NONBLOCK选项之外是否设置了其它选项,如果有,则返回-EINVAL。

之后的检查我理解为是一个兼容性检查:在SOCK_NONBLOCK与O_NONBLOCK取值不同的平台上,如果设置了SOCK_NONBLOCK选项,则将flags中的SOCK_NONBLOCK位复位,并将O_NONBLOCK位置位;两者取值相同时则无需转换。

随后调用sock_create创建一个新的socket对象,之后再调用sock_map_fd将该socket对象映射为文件描述符retval,随即将其返回,从而调用者得到一个socket描述符。sock_create函数直接调用了__sock_create函数来创建socket对象,具体看__sock_create函数的实现:

/*
 * __sock_create - allocate a struct socket and let its protocol family
 * initialise it.
 *
 * @net:      passed through to the family's ->create hook
 * @family:   protocol family; used as the index into net_families[]
 * @type:     socket type; stored in sock->type
 * @protocol: passed through to the family's ->create hook
 * @res:      receives the new socket on success
 * @kern:     passed to the security hooks and to ->create
 *
 * Returns 0 on success.  On failure returns a negative errno:
 * -EAFNOSUPPORT (family out of range, family not registered, or a module
 * reference could not be taken), -EINVAL (type out of range), -ENFILE
 * (sock_alloc() failed), or the error reported by a security hook or by
 * the family's ->create.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;
 
	/* Reject family/type values outside the supported ranges. */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;
 
	/*
	 * Compatibility: (PF_INET, SOCK_PACKET) is rewritten to PF_PACKET,
	 * with a message printed the first time it is seen.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		static int warned;
		if (!warned) {
			warned = 1;
			printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			       current->comm);
		}
		family = PF_PACKET;
	}
 
	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;
 
	sock = sock_alloc();
	if (!sock) {
		if (net_ratelimit())
			printk(KERN_WARNING "socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}
 
	sock->type = type;
 
#ifdef CONFIG_MODULES
	/* Family not registered yet: ask for module "net-pf-<family>". */
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif
 
	/* Look the family up under RCU; err is preset for both failure gotos. */
	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;
 
	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;
 
	/* Now protected by module ref count */
	rcu_read_unlock();
 
	/* Family-specific construction. */
	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;
 
	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;
 
	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;
 
	return 0;
 
	/*
	 * Error unwinding: each label falls through to the next, so later
	 * labels undo earlier acquisitions.
	 */
out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	/* Clear ops before releasing the socket. */
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;
 
out_release:
	/* Reached with the RCU read lock still held. */
	rcu_read_unlock();
	goto out_sock_release;
}

开始先进行参数范围检查和兼容性处理,随后进行安全性检查(security_socket_create()在未启用CONFIG_SECURITY时是个空函数,可以忽略)。之后调用sock_alloc()函数在VFS上分配一个struct socket对象,所有的协议类型创建socket时创建的均为这个对象,可以理解为是所有网络层socket的模板或者说父类,上层协议栈在初始化socket时会根据这个已创建好的struct socket对象创建并初始化一个struct sock对象,这个对象包含更多上层协议栈的详细信息。

接下来的net_families数组是一个全局变量,各协议族在初始化时通过sock_register()向其中注册(AF_INET的注册在inet_init()函数内完成),其定义如下:

/*
 * Registered protocol families, indexed by family number.  A NULL slot
 * means no family is registered at that index.
 */
static const struct net_proto_family *net_families[NPROTO] __read_mostly;

每个协议族都会在该数组中对应一个net_proto_family结构体,当然,未实现的协议族中对应位置的结构体指针为空,我们只关心最常用的协议族即AF_INET,其值为2,而NPROTO的值等于AF_MAX,在2.6.37内核中值为38。刚才提到AF_INET对应的元素在inet_init()中被初始化,查看该函数相关代码可知它调用sock_register()函数注册INET协议族,代码如下:

(void)sock_register(&inet_family_ops);

其中inet_family_ops是一个全局静态变量,其定义如下:

/* Family descriptor for PF_INET, handed to sock_register(). */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create, /* plays a key role in the socket creation path */
	.owner	= THIS_MODULE,
};

再继续查看sock_register()函数,其代码如下:

/*
 * sock_register - install a protocol family in net_families[].
 * @ops: family descriptor; ops->family selects the slot
 *
 * Returns 0 on success, -ENOBUFS if ops->family is not below NPROTO, or
 * -EEXIST if the slot is already occupied.  The KERN_INFO message is
 * printed whenever the range check passes, regardless of the result.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err = -EEXIST;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	/* Claim the slot under the lock only if nobody owns it yet. */
	spin_lock(&net_family_lock);
	if (!net_families[ops->family]) {
		net_families[ops->family] = ops;
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}

代码很简单,即将net_proto_family对象插入到net_families数组对应的位置中,由此来完成对net_families的初始化。

OK,继续返回刚才的__sock_create()函数,看下面的代码:

#ifdef CONFIG_MODULES
	if (net_families[family] == NULL)
		request_module("net-pf-%d", family);
#endif

如果编译时支持可安装模块,则首先检测net_families数组中对应的family是否存在,假设此处为AF_INET,如果不存在,即在系统初始化时没有通过sock_register()函数对该元素进行注册,则调用request_module()动态地安装模块net-pf-2。

再往下通过RCU机制获取net_families中对应的net_proto_family对象,RCU机制主要用于网络层和VFS中,关于RCU机制的更多细节可以参考@gnawux师兄的译文:What is RCU, Fundamentally?

之后由注释可知增加该模块的引用计数,重要的函数为pf->create(),当协议族为AF_INET时,由以上可知,create函数为inet_create(),由这个函数进一步创建该socket,我们再看inet_create()函数的实现:

/*
 * inet_create - set up a PF_INET socket.
 *
 * @net:      network namespace, passed to inet_netns_ok() and sk_alloc()
 * @sock:     the socket being initialised; sock->type selects the inetsw[] list
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 * @kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW
 *
 * Finds the matching inet_protosw entry, allocates a struct sock with
 * sk_alloc(), initialises it and runs the protocol's ->init hook if set.
 *
 * Returns 0 on success or a negative errno: -ESOCKTNOSUPPORT (the list for
 * this socket type is empty), -EPROTONOSUPPORT (no entry matches),
 * -EPERM (SOCK_RAW without CAP_NET_RAW), -EAFNOSUPPORT (inet_netns_ok()
 * refused), -ENOBUFS (sk_alloc() failed), or the value returned by ->init.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;
 
	/*
	 * If the ehash secret has not been generated yet, build it now,
	 * unless the socket type is SOCK_RAW or SOCK_DGRAM.  (The original
	 * note says only TCP needs the secret.)
	 */
	if (unlikely(!inet_ehash_secret))
		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
			build_ehash_secret();
 
	/* Mark the socket as not connected. */
	sock->state = SS_UNCONNECTED;
 
	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
 
	/*
	 * Walk the inetsw[] list for the requested socket type.
	 *  - exact match on a non-wildcard protocol: take the entry;
	 *  - caller passed the wildcard IPPROTO_IP (value 0): adopt the
	 *    protocol of the first entry that has a different protocol;
	 *  - the entry itself is a wildcard: take it for any protocol.
	 * If the loop ends without a break, err is -EPROTONOSUPPORT, or
	 * still -ESOCKTNOSUPPORT when the list was empty.
	 */
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}
	/*
	 * No match: try loading a protocol module and repeat the lookup,
	 * at most twice (specific module name first, then the generic one).
	 */
	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}
 
	err = -EPERM;
	/*
	 * A SOCK_RAW socket requested with kern == 0 requires CAP_NET_RAW;
	 * without it the call fails with -EPERM.
	 */
	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
		goto out_rcu_unlock; 
	err = -EAFNOSUPPORT;
	/* Ask inet_netns_ok() whether this protocol is supported for net. */
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;
 
	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();
 
	WARN_ON(answer_prot->slab == NULL);
 
	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;
 
	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;
 
	inet = inet_sk(sk);
	/*
	 * Record whether the entry carries INET_PROTOSW_ICSK.  (The original
	 * note: this marks connection-based sockets, currently only
	 * SOCK_STREAM.)
	 */
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
	inet->nodefrag = 0;
 
	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		/*
		 * IPPROTO_RAW sets hdrincl.  NOTE(review): presumably this
		 * means the application supplies the IP header itself --
		 * confirm against the raw socket send path.
		 */
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}
 
	/*
	 * Pick the path-MTU-discovery mode from ipv4_config.no_pmtu_disc.
	 * (The original note says this can be toggled through
	 * /proc/sys/net/ipv4/ip_no_pmtu_disc.)
	 */
	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;
 
	inet->inet_id = 0;
 
	/* Further initialise sk using the contents of sock. */
	sock_init_data(sock, sk);
 
	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
 
	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
 
	sk_refcnt_debug_inc(sk);
 
	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}
 
	/* Protocol-specific init hook, if any; on failure release sk. */
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

说一下inetsw这个数组,它也是针对于AF_INET协议族而言的,是一个全局静态变量,包含着创建一个新的socket所需要的所有信息,定义如下:

/* One list head per socket type; inet_create() walks inetsw[sock->type]. */
static struct list_head inetsw[SOCK_MAX];

其中每个元素都是一个双向链表的表头,该数组的初始化也是在inet_init()函数中完成的,看其中相关代码:

/* Initialise every list head in inetsw[] ... */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
	INIT_LIST_HEAD(r);
/* ... then pass each inetsw_array entry to inet_register_protosw(). */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
	inet_register_protosw(q);

首先初始化每个链表元素的头,接下来将inetsw_array这个数组中的元素使用inet_register_protosw()函数注册到inetsw数组中,inetsw_array也是一个全局的静态变量,其定义如下:

/*
 * Built-in (type, protocol) entries: TCP for SOCK_STREAM, UDP for
 * SOCK_DGRAM, and a wildcard-protocol entry for SOCK_RAW.  Each entry
 * names the struct proto and the proto_ops to use, plus the no_check
 * setting and INET_PROTOSW_* flags.
 */
static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.no_check =   0,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},
 
	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.no_check =   UDP_CSUM_DEFAULT,
		.flags =      INET_PROTOSW_PERMANENT,
       },
 
 
       {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .no_check =   UDP_CSUM_DEFAULT,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

它包含了各种协议类型所需要的基本信息。

接下来很重要的一步,就是调用sk_alloc()函数创建一个新的struct sock对象,struct socket和struct sock的区别在于,struct socket是创建每个BSD socket所必须的,它描述BSD socket的一些基本信息,而struct sock则描述网络层的相关信息,它在struct socket的基础上构建。看一下sk_alloc()的定义:

/*
 * sk_alloc - allocate a zeroed struct sock and fill in its basic fields.
 * @net:      network namespace to attach; a reference is taken with get_net()
 * @family:   stored in sk->sk_family
 * @priority: allocation flags; __GFP_ZERO is OR-ed in
 * @prot:     stored in both sk->sk_prot and sk->sk_prot_creator
 *
 * Returns the new sock, or NULL if sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	if (!sk)
		return NULL;

	sk->sk_family = family;
	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	sk->sk_prot_creator = prot;
	sk->sk_prot = prot;
	sock_lock_init(sk);
	sock_net_set(sk, get_net(net));
	atomic_set(&sk->sk_wmem_alloc, 1);

	sock_update_classid(sk);

	return sk;
}

最后一个参数为当前协议类型对应的struct proto对象,将刚创建的struct sock对象的sk_prot和sk_prot_creator成员初始化为prot。sk_prot_alloc()函数用于创建sock对象,可以在slab分配器上创建,也可以在普通缓存中创建。

接下来由sock_init_data()函数进一步初始化sk:

/*
 * sock_init_data - give a freshly allocated struct sock its default state.
 * @sock: owning socket, or NULL; when set, sk and sock are linked together
 * @sk:   the sock to initialise
 *
 * Sets up the queues, timer, buffer sizes, locks, default callbacks and
 * timeouts, and finally sets sk_refcnt to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	/*
	 * Initialise the receive, write and error queues.  (Original note:
	 * these are doubly linked lists of struct sk_buff, the structure
	 * that carries packet information; sk_error_queue is rarely used.)
	 */
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
 
	sk->sk_send_head	=	NULL;
 
	init_timer(&sk->sk_timer);
 
	sk->sk_allocation	=	GFP_KERNEL;
	/* Receive buffer size limit in bytes, from sysctl_rmem_default. */
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	/* Send buffer size limit in bytes, from sysctl_wmem_default. */
	sk->sk_sndbuf		=	sysctl_wmem_default;
	/*
	 * Connection state.  (Original note: SOCK_DGRAM and SOCK_RAW reuse
	 * some of the TCP states as well.)  Every sock starts in TCP_CLOSE.
	 */
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);
 
	sock_set_flag(sk, SOCK_ZAPPED);
 
	/* With an owning socket: copy its type and wq, and point it at sk. */
	if (sock) {
		sk->sk_type	=	sock->type;
		sk->sk_wq	=	sock->wq;
		sock->sk	=	sk;
	} else
		sk->sk_wq	=	NULL;
 
	spin_lock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
	/* Lockdep class and name are selected by sk->sk_family. */
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
 
	/* Default callbacks. */
	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;
 
	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;
 
	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
 
	sk->sk_stamp = ktime_set(-1L, 0);
 
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}

在inet_create()函数的最后还有一个很重要的初始化,下面这段代码:

/* Run the protocol's init hook if one is set; on error release sk. */
if (sk->sk_prot->init) {
	err = sk->sk_prot->init(sk);
	if (err)
		sk_common_release(sk);
}

前面在sk_alloc()函数中为sk初始化时将协议类型对应的proto对象指针赋给了sk->sk_prot,这里的init()函数即对应的proto中的init函数,当协议类型为SOCK_STREAM时,prot即为tcp_prot(见inetsw_array), 对应的init()函数即为:

static int tcp_v4_init_sock(struct sock *sk)

分类: Kernel 标签: