# Linux socket 创建原理

概述

在C语言网络编程中，我们通过socket函数创建fd，何为fd呢？在vfs中，一切皆文件，对于套接字也一样，均是通过实现VFS框架中的结构指针完成创建（详细参考混沌学堂）。本文将详细介绍 server socket的创建原理，为后面分析 socket 的 option （SO_XXXX）原理打下基础。

/* User-space example: request an AF_INET socket of type SOCK_STREAM; the protocol argument is 0. */
int serverFD = socket(AF_INET, SOCK_STREAM, 0);

在上述源码中，我们指定了协议簇为 AF_INET，套接字类型为 SOCK_STREAM（面向流的套接字，在 AF_INET 协议簇下默认对应 TCP 协议）。接下来，我们跟进系统调用 sys_socket。

/*
 * System-call entry point for socket().
 *
 * Creates the socket with sock_create() and then maps it to a file
 * descriptor with sock_map_fd(). The value returned at the end is the
 * result of sock_map_fd(). The `out` and `out_release` labels live in
 * the part of the body that this excerpt elides with `...`.
 */
asmlinkage long sys_socket(int family, int type, int protocol)

{

  int retval;

  struct socket *sock;

  retval = sock_create(family, type, protocol, &sock); // create the socket

  if (retval < 0)

    goto out;

  retval = sock_map_fd(sock); // map the socket to a file and obtain its fd

  if (retval < 0)

    goto out_release;

 ...

  return retval;

}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

sock_create 函数

可以看到该函数，将会根据协议簇找到 net_proto_family ，然后调用 net_proto_family的 create 方法来完成socket的数据填充，同时在该方法中传入了 protocol 协议号，表示使用该协议簇中的 protocol 协议，对于本例而言，创建的为 SOCK_STREAM 协议。源码如下。

/* Upper bound on the number of protocol families; 32 is enough for now. */
#define NPROTO  32

/* Table of registered protocol families, indexed by family number. */
static struct net_proto_family *net_families[NPROTO];



/*
 * Create a socket for the given protocol family.
 *
 * Visible steps: check that net_families[family] is registered, allocate
 * a struct socket with sock_alloc(), record the requested type, then let
 * the family's create() callback fill in the socket for `protocol`.
 * Parts of the body (including the `out` / `out_module_put` labels and
 * the final return) are elided with `...` in this excerpt.
 *
 * NOTE(review): the error code is stored in `i` before each goto;
 * presumably `i` is what the elided tail returns - confirm.
 */
int sock_create(int family, int type, int protocol, struct socket **res)

{

 int i;

 int err;

 struct socket *sock;

 ...

 if (net_families[family] == NULL) { // the requested protocol family is not registered

  i = -EAFNOSUPPORT;

  goto out;

 }

 if (!(sock = sock_alloc())) // allocate the socket structure

 {

  printk(KERN_WARNING "socket: no more sockets\n");

  i = -ENFILE;  /* Not exactly a match, but its the

      closest posix thing */

  goto out;

 }

 sock->type = type;

 ...

 if ((i = net_families[family]->create(sock, protocol)) < 0) // let the protocol family fill in the socket

  goto out_module_put;

 ...

}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

sock_alloc 函数

该函数用于分配 socket 结构，可以看到这里通过socketfs的超级块来分配，同时分配的有inode信息，由于这里只是socket 挂入VFS，不涉及实际inode文件操作，所以对于socket的inode而言，所有inode的操作均为空操作。源码如下。

/* Mount of the socket filesystem. */
static struct vfsmount *sock_mnt;

/* Filesystem type descriptor for "sockfs" and its superblock callbacks. */
static struct file_system_type sock_fs_type = {
	.name    = "sockfs",
	.get_sb  = sockfs_get_sb,
	.kill_sb = kill_anon_super,
};



/* Superblock operations for sockfs: inode allocation/teardown and statfs. */
static struct super_operations sockfs_ops = {
	.alloc_inode   = sock_alloc_inode,
	.destroy_inode = sock_destroy_inode,
	.statfs        = simple_statfs,
};



/*
 * get_sb callback of sock_fs_type: obtains the sockfs superblock from
 * get_sb_pseudo(), passing the "socket:" name, sockfs_ops and
 * SOCKFS_MAGIC. The flags, dev_name and data arguments are not used.
 */
static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct super_block *sb;

	sb = get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
	return sb;
}



/*
 * Allocate a struct socket.
 *
 * A new inode is requested from the sockfs superblock (sock_mnt->mnt_sb)
 * and the socket is obtained from that inode with SOCKET_I().
 * Returns NULL when no inode could be allocated. Part of the body is
 * elided with `...` in this excerpt.
 */
struct socket *sock_alloc(void)

{

 struct inode * inode;

 struct socket * sock;

 inode = new_inode(sock_mnt->mnt_sb);

 if (!inode)

  return NULL;

 sock = SOCKET_I(inode); // inode and socket live together in one struct socket_alloc, so the socket pointer can be derived from the inode pointer

 ...

 return sock;

}



// The superblock manages the filesystem's metadata, and inodes are allocated through it as well.
// Only the start of the function is shown; the rest (including the return) is elided with `...`.

struct inode *new_inode(struct super_block *sb)

{

 static unsigned long last_ino;

 struct inode * inode;

 spin_lock_prefetch(&inode_lock);

 inode = alloc_inode(sb); // create the inode through the superblock

 ...

}



/*
 * Allocate an inode for superblock `sb`.
 *
 * Uses the superblock's own alloc_inode callback when one is set,
 * otherwise allocates from inode_cachep. On success the inode's i_op and
 * i_fop are pointed at the static, zero-initialised empty_iops/empty_fops
 * tables and i_mapping at the inode's embedded i_data. Returns the inode,
 * or whatever the allocator returned on failure. Some initialisation is
 * elided with `...` in this excerpt.
 */
static struct inode *alloc_inode(struct super_block *sb)

{

 static struct address_space_operations empty_aops;

 static struct inode_operations empty_iops;

 static struct file_operations empty_fops;

 struct inode *inode;

  // Prefer the superblock's alloc_inode callback; fall back to the slab cache when it is not set

 if (sb->s_op->alloc_inode)

  inode = sb->s_op->alloc_inode(sb); 

 else

  inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);

 if (inode) { // allocation succeeded: initialise the inode; the operation tables are set to empty ones

  struct address_space * const mapping = &inode->i_data;

  ...

  inode->i_op = &empty_iops;

  inode->i_fop = &empty_fops;

  ...

  inode->i_mapping = mapping;

 }

 return inode;

}



// Wrapper that holds the socket and its inode together. Both are embedded by value (not as pointers), so a single memory block contains both.

struct socket_alloc {

 struct socket socket;

 struct inode vfs_inode;

};



/*
 * alloc_inode callback referenced by sockfs_ops.
 *
 * Takes one struct socket_alloc from sock_inode_cachep, resets the
 * embedded socket to its initial, unconnected state and hands back the
 * embedded inode. Returns NULL if the cache allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *wrapper;

	wrapper = (struct socket_alloc *)kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL);
	if (wrapper == NULL)
		return NULL;

	/* Wait queue of the embedded socket. */
	init_waitqueue_head(&wrapper->socket.wait);

	/* Remaining members of the embedded socket. */
	wrapper->socket.fasync_list = NULL;
	wrapper->socket.state       = SS_UNCONNECTED;
	wrapper->socket.flags       = 0;
	wrapper->socket.ops         = NULL;
	wrapper->socket.sk          = NULL;
	wrapper->socket.file        = NULL;
	wrapper->socket.passcred    = 0;

	return &wrapper->vfs_inode;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179

TCP 协议簇初始化原理

前面我们看到，内核会通过 net_families[family]->create(sock, protocol) 方法初始化 socket。那么对于 TCP/IP 协议簇来说，net_families 数组中的对应项是何时注册（初始化）的呢？

static struct net_proto_family *net_families[NPROTO]; // table of protocol families



// Register a net_proto_family: if the slot for ops->family is still empty, store ops there.
// The rest of the body (including the declaration of err and the return) is elided with `...`.

int sock_register(struct net_proto_family *ops)

{

 ...

 if (net_families[ops->family] == NULL) {

  net_families[ops->family]=ops; // store the net_proto_family pointer in its slot

  err = 0;

 }

 ...

}



/* Descriptor of the TCP/IP (PF_INET) protocol family. */
struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,	/* callback that fills in a new socket */
	.owner  = THIS_MODULE,
};



/*
 * Concrete protocols offered by the TCP/IP family.
 * For this article the SOCK_STREAM entry is the one of interest.
 */
static struct inet_protosw inetsw_array[] = {
	/* TCP */
	{
		.type       = SOCK_STREAM,
		.protocol   = IPPROTO_TCP,
		.prot       = &tcp_prot,
		.ops        = &inet_stream_ops,	/* TCP socket operations */
		.capability = -1,
		.no_check   = 0,
		.flags      = INET_PROTOSW_PERMANENT,
	},

	/* UDP */
	{
		.type       = SOCK_DGRAM,
		.protocol   = IPPROTO_UDP,
		.prot       = &udp_prot,
		.ops        = &inet_dgram_ops,
		.capability = -1,
		.no_check   = UDP_CSUM_DEFAULT,
		.flags      = INET_PROTOSW_PERMANENT,
	},

	/* Raw sockets (can observe IP-layer protocols) */
	{
		.type       = SOCK_RAW,
		.protocol   = IPPROTO_IP,
		.prot       = &raw_prot,
		.ops        = &inet_dgram_ops,
		.capability = CAP_NET_RAW,
		.no_check   = UDP_CSUM_DEFAULT,
		.flags      = INET_PROTOSW_REUSE,
	},
};



// TCP 操作结构，这里将函数指针初始化，后面我们在操作该socket时，将会用到以下函数

struct proto_ops inet_stream_ops = {

 .family = PF_INET,

 .owner = THIS_MODULE,

 .release = inet_release,

 .bind =  inet_bind,

 .connect = inet_stream_connect,

 .socketpair = sock_no_socketpair,

 .accept = inet_accept,

 .getname = inet_getname,

 .poll =  tcp_poll,

 .ioctl = inet_ioctl,

 .listen = inet_listen,

 .shutdown = inet_shutdown,

 .setsockopt = inet_setsockopt,

 .getsockopt = inet_getsockopt,

 .sendmsg = inet_sendmsg,

 .recvmsg = inet_recvmsg,

 .mmap =  sock_no_mmap,

 .sendpage = tcp_sendpage

};





// Protocol initialisation: registers the TCP/IP family and then each protocol listed in inetsw_array.
// Most of the body (including the declaration of q and the return) is elided with `...`.

static int __init inet_init(void){

 ...

 (void)sock_register(&inet_family_ops); // register the TCP/IP family in the net_families table

 ...

  for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) // walk inetsw_array and register each protocol of the family (e.g. SOCK_STREAM / TCP)

  inet_register_protosw(q);

 ...

}



static struct list_head inetsw[SOCK_MAX]; // per-socket-type lists of the TCP/IP family's protocols



/*
 * Register protocol switch entry `p`.
 *
 * The visible loop scans the entries already on inetsw[p->type]. For
 * entries flagged INET_PROTOSW_PERMANENT it stops when one has the same
 * protocol number, and otherwise remembers the entry in last_perm.
 * `answer` is reset to NULL whenever the current entry did not match.
 *
 * NOTE(review): lh, answer, last_perm and protocol are declared in the
 * elided part; the actual list insertion is also elided - confirm against
 * the full source.
 */
void inet_register_protosw(struct inet_protosw *p)

{

 ...

 list_for_each(lh, &inetsw[p->type]) { // scan the entries already registered for this socket type

  answer = list_entry(lh, struct inet_protosw, list);



  /* Check only the non-wild match. */

  if (INET_PROTOSW_PERMANENT & answer->flags) {

   if (protocol == answer->protocol)

    break;

   last_perm = lh;

  }



  answer = NULL;

 }

 ...

}

inet_create 函数

前面我们看到，在 TCP/IP 协议簇初始化过程中，该函数被注册为 struct net_proto_family inet_family_ops 的 create 回调，即该协议簇创建 socket 的实现。通过源码可以看到，它会调用 sk_alloc 分配 Linux 底层网络结构 sock，随后遍历 inetsw[sock->type] 协议链表，匹配 protocol 指定的协议，然后使用匹配到的 inet_protosw *answer 中定义的操作函数初始化 socket 与 sock。源码如下。

/*
 * create() callback of inet_family_ops: fills in a new PF_INET socket.
 *
 * Visible steps: mark the socket unconnected, allocate the underlying
 * struct sock with sk_alloc(), search inetsw[sock->type] for the entry
 * matching `protocol`, copy that entry's ops/prot/no_check into the
 * socket and sock, and call sock_init_data(). Several parts, including
 * the `out` label and the return, are elided with `...`.
 */
static int inet_create(struct socket *sock, int protocol)

{

 struct sock *sk;

 struct list_head *p;

 struct inet_protosw *answer;

 struct inet_opt *inet;

 int err = -ENOBUFS;

 sock->state = SS_UNCONNECTED; // the socket starts out unconnected

 sk = sk_alloc(PF_INET, GFP_KERNEL, inet_sk_size(protocol),

     inet_sk_slab(protocol)); // allocate the kernel's underlying struct sock; struct socket is the upper, BSD-socket-facing layer, while the real work is carried out through struct sock

 if (!sk)

  goto out;



 ...

 list_for_each_rcu(p, &inetsw[sock->type]) { // walk the family's protocols for this socket type, looking for the one named by `protocol`

  answer = list_entry(p, struct inet_protosw, list);

  if (protocol == answer->protocol) { // exact match; accepted only when protocol is not IPPROTO_IP

   if (protocol != IPPROTO_IP)

    break;

  } else { // no exact match. A caller that passes protocol 0 passes IPPROTO_IP (IPPROTO_IP = 0), so the first entry reached here is chosen and its protocol adopted

   if (IPPROTO_IP == protocol) {

    protocol = answer->protocol;

    break;

   }

   if (IPPROTO_IP == answer->protocol)

    break;

  }

  answer = NULL;

 }

 ...

  // Copy the operations named by the matched protocol entry into the socket and the sock
  // NOTE(review): answer is NULL here if nothing matched; presumably the elided code above handles that - confirm

 sock->ops = answer->ops;

 sk->sk_prot = answer->prot;

 sk->sk_no_check = answer->no_check;

 ...

  sock_init_data(sock, sk); // initialise the socket and sock data

 ...

}



/* Values used as the default send and receive buffer memory sizes. */
#define SK_WMEM_MAX 65535
#define SK_RMEM_MAX 65535

/* Default send (wmem) and receive (rmem) buffer memory sizes. */
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;





/*
 * Initialise a struct sock and, when a socket is supplied, link the two
 * objects to each other (sk->sk_socket / sock->sk).
 *
 * sock: owning socket, may be NULL (then sk->sk_sleep is set to NULL).
 * sk:   the sock to initialise.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	/* Queues of the sock. */
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	/* Timer of the sock. */
	init_timer(&sk->sk_timer);

	/* Allocation mode, buffer sizes and initial state. */
	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;	/* receive buffer size */
	sk->sk_sndbuf = sysctl_wmem_default;	/* send buffer size */
	sk->sk_state = TCP_CLOSE;
	sk->sk_zapped = 1;
	sk->sk_socket = sock;

	/* Two-way binding between socket and sock. */
	if (sock == NULL) {
		sk->sk_sleep = NULL;
	} else {
		sk->sk_type = sock->type;
		sk->sk_sleep = &sock->wait;
		sock->sk = sk;
	}

	sk->sk_dst_lock = RW_LOCK_UNLOCKED;
	sk->sk_callback_lock = RW_LOCK_UNLOCKED;

	/* Default callbacks. */
	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_peercred.pid = 0;
	sk->sk_peercred.uid = -1;
	sk->sk_peercred.gid = -1;

	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;	/* receive timeout */
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;	/* send timeout */
	sk->sk_owner = NULL;

	atomic_set(&sk->sk_refcnt, 1);
}

sock_map_fd 函数

该函数用于将 socket 映射到 file 结构，同时将file结构放入进程的file数组中，同时返回数组下标 fd（详细参考混沌学堂介绍）。

/* File operations table installed on files that represent sockets. */
static struct file_operations socket_file_ops = {
	.owner     = THIS_MODULE,
	.llseek    = no_llseek,
	.aio_read  = sock_aio_read,
	.aio_write = sock_aio_write,
	.poll      = sock_poll,
	.ioctl     = sock_ioctl,
	.mmap      = sock_mmap,
	.open      = sock_no_open,
	.release   = sock_close,
	.fasync    = sock_fasync,
	.readv     = sock_readv,
	.writev    = sock_writev,
	.sendpage  = sock_sendpage,
};







int sock_map_fd(struct socket *sock)

{

 int fd;

 struct qstr this; // 保存字符串信息，用于创建 dentry

 char name[32];

 fd = get_unused_fd(); // 找到一个未使用的fd下标，用于存放 file 结构指针

 if (fd >= 0) {

  struct file *file = get_empty_filp(); // 分配 file 结构

  if (!file) {

   put_unused_fd(fd);

   fd = -ENFILE;

   goto out;

  } 

  sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);

  this.name = name;

  this.len = strlen(name);

  this.hash = SOCK_INODE(sock)->i_ino;

  file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); // 分配一个新的文件目录，并指定父目录为超级块的根目录

  if (!file->f_dentry) {

   put_filp(file);

   put_unused_fd(fd);

   fd = -ENOMEM;

   goto out;

  }

    // 初始化file结构属性，同时将file与socket绑定

  file->f_dentry->d_op = &sockfs_dentry_operations;

  d_add(file->f_dentry, SOCK_INODE(sock));

  file->f_vfsmnt = mntget(sock_mnt);

  sock->file = file;

  file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops; // 文件操作与inode操作初始化为 socket_file_ops

  file->f_mode = 3;

  file->f_flags = O_RDWR;

  file->f_pos = 0;

  fd_install(fd, file); // 将 file 结构放入到 fd 下标处

 }

out:

 return fd;

}



/*
 * Store `file` at index `fd` of the current process's files->fd array,
 * holding files->file_lock for the update. Hits BUG() if that slot is
 * already occupied.
 */
void fd_install(unsigned int fd, struct file * file)
{
	struct files_struct *table = current->files;
	struct file **slot;

	spin_lock(&table->file_lock);
	slot = &table->fd[fd];
	if (unlikely(*slot != NULL))
		BUG();
	*slot = file;
	spin_unlock(&table->file_lock);
}

← Unsafe 操作基本数据类型与对象引发的问题分析 Linux 端口绑定原理 →