美文网首页
linux内核tcp协议栈走读记录(一)

linux内核tcp协议栈走读记录(一)

作者: help_youself | 来源:发表于2019-07-18 15:33 被阅读0次

 先看[1]中的两段代码:

 /* Create an AF_INET / SOCK_STREAM socket from kernel code in the init_net
  * namespace; the new socket is returned through &sock. */
 ret = sock_create_kern(&init_net, AF_INET, SOCK_STREAM, 0, &sock);
/* Connect to s_addr by dispatching through the socket's ops table. */
ret = sock->ops->connect(sock, (struct sockaddr *)&s_addr, sizeof(s_addr), 0);

 sock是怎么创建的?sock->ops中有多少操作?

/*
 * Excerpt of inet_init() (the rest of the body is elided by the article):
 * registers the inet_family_ops handler with the socket layer via
 * sock_register(). The return value is deliberately ignored.
 */
int __init inet_init(void)
{
(void)sock_register(&inet_family_ops);
}
//af_inet.c
/*
 * Address-family descriptor for PF_INET. Its .create hook is inet_create,
 * which is what __sock_create() reaches through pf->create.
 */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
/*  .owner  = THIS_MODULE,*/
};
/*
 * sock_create_kern - create a socket on behalf of kernel code.
 * @net:      network namespace the socket is created in (the article's
 *            caller passes &init_net)
 * @family:   protocol family, e.g. AF_INET
 * @type:     socket type, e.g. SOCK_STREAM
 * @protocol: protocol number, 0 in the article's example
 * @res:      out-parameter receiving the new struct socket
 *
 * Thin wrapper around __sock_create() that passes kern = 1.
 * Returns whatever __sock_create() returns.
 *
 * The namespace is taken as a parameter instead of being hard-coded to
 * &init_net, so the definition agrees with the five-argument call shown at
 * the top of the article.
 */
int sock_create_kern(struct net *net, int family, int type, int protocol,
             struct socket **res)
{
    return __sock_create(net, family, type, protocol, res, 1);
}
/*
 * Excerpt of __sock_create() (most of the body is elided by the article).
 * The step of interest is the call into the address family's create hook;
 * pf, sock and err are locals whose setup is not shown here.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
             struct socket **res, int kern)
{
err = pf->create(net, sock, protocol, kern);
}
// Here pf->create points to inet_create (see inet_family_ops.create).

 接下来就需要分析inet_create(af_inet.c)的处理流程。

/*
 * Excerpt of inet_create() (lookup of `answer` and error handling are elided
 * by the article). From the selected inet_protosw entry `answer` it:
 *   - installs the socket-level operations (sock->ops),
 *   - allocates the struct sock with the entry's struct proto,
 *   - links the socket and sock together via sock_init_data().
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
               int kern)
{
    sock->ops = answer->ops;
    answer_prot = answer->prot;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
sock_init_data(sock, sk);
}
/*
 * Excerpt of sk_alloc() (allocation itself is elided by the article).
 * Records the protocol descriptor passed in as `prot` in both sk_prot and
 * sk_prot_creator; later calls such as sk->sk_prot->connect go through it.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
              struct proto *prot)
{
    sk->sk_prot = sk->sk_prot_creator = prot;
}

 type和protocol都是应用层调用socket函数时传入的参数。其中type对应三种数据包的处理方式,主要有SOCK_STREAM(TCP)、SOCK_DGRAM(UDP)、SOCK_RAW(原始socket)。代码中的answer指针,就是根据type和protocol从inetsw_array的表项中选出的某一个。根据上面的代码可知:sock->ops = answer->ops。

/*
 * Table of inet protocol-switch entries, one per (type, protocol) pair.
 * Each entry supplies the transport-level struct proto (.prot) and the
 * socket-level struct proto_ops (.ops) that inet_create() copies into the
 * new socket.
 */
static struct inet_protosw inetsw_array[] =
{
    /* TCP: stream sockets */
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },

    /* UDP: datagram sockets */
    {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },

       /* ICMP over datagram sockets (ping_prot) */
       {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_ICMP,
        .prot =       &ping_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_REUSE,
       },

       /* Raw sockets */
       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

 以tcp为例,这里的ops指针中内容是:

/*
 * Socket-level operations for PF_INET stream sockets; this is the table
 * installed as sock->ops for the SOCK_STREAM entry of inetsw_array.
 * sock->ops->connect therefore resolves to inet_stream_connect.
 * The entries that are commented out were disabled in the tree the article
 * quotes from.
 */
const struct proto_ops inet_stream_ops = {
    .family        = PF_INET,
    /*.owner           = THIS_MODULE,*/
    .release       = inet_release,
    .bind          = inet_bind,
    .connect       = inet_stream_connect,
    .socketpair    = sock_no_socketpair,
    .accept        = inet_accept,
    .getname       = inet_getname,
//  .poll          = tcp_poll,
    .ioctl         = inet_ioctl,
    .listen        = inet_listen,
    .shutdown      = inet_shutdown,
    .setsockopt    = sock_common_setsockopt,
    .getsockopt    = sock_common_getsockopt,
    .sendmsg       = inet_sendmsg,
    .recvmsg       = inet_recvmsg,
//  .mmap          = sock_no_mmap,
    .sendpage      = inet_sendpage,
//  .splice_read       = tcp_splice_read,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_sock_common_setsockopt,
    .compat_getsockopt = compat_sock_common_getsockopt,
    .compat_ioctl      = inet_compat_ioctl,
#endif
};

 回到文首提到的sock->ops->connect操作,就是执行了inet_stream_connect。在上面提到sk_alloc函数, sk->sk_prot指向的就是answer->prot。

/*
 * Excerpt of inet_stream_connect() (surrounding code is elided by the
 * article): forwards its arguments unchanged to __inet_stream_connect().
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
            int addr_len, int flags)
{
    err = __inet_stream_connect(sock, uaddr, addr_len, flags);
}
/*
 * Excerpt of __inet_stream_connect() (state handling is elided by the
 * article): hands the connect request to the transport protocol through
 * sk->sk_prot->connect. `sk` is a local whose derivation from `sock` is not
 * shown here.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
              int addr_len, int flags)
{
err = sk->sk_prot->connect(sk, uaddr, addr_len);
}

 如果是tcp协议,answer->prot就是指向了tcp_prot。结构体tcp_prot(tcp_ipv4.c)中的内容:

/*
 * Transport-level protocol descriptor for TCP; this is the struct proto
 * referenced by the SOCK_STREAM entry of inetsw_array and stored in
 * sk->sk_prot. Its .connect hook is tcp_v4_connect, and .obj_size makes each
 * socket object the size of a struct tcp_sock.
 */
struct proto tcp_prot = {
    .name           = "TCP",
    //.owner            = THIS_MODULE,
    .close          = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect     = tcp_disconnect,
    .accept         = inet_csk_accept,
    .ioctl          = tcp_ioctl,
    .init           = tcp_v4_init_sock,
    .destroy        = tcp_v4_destroy_sock,
    .shutdown       = tcp_shutdown,
    .setsockopt     = tcp_setsockopt,
    .getsockopt     = tcp_getsockopt,
    .recvmsg        = tcp_recvmsg,
    .sendmsg        = tcp_sendmsg,
    .sendpage       = tcp_sendpage,
    .backlog_rcv        = tcp_v4_do_rcv,
    .release_cb     = tcp_release_cb,
    .mtu_reduced        = tcp_v4_mtu_reduced,
    .hash           = inet_hash,
    .unhash         = inet_unhash,
    .get_port       = inet_csk_get_port,
    .enter_memory_pressure  = tcp_enter_memory_pressure,
    .stream_memory_free = tcp_stream_memory_free,
    .sockets_allocated  = &tcp_sockets_allocated,
    .orphan_count       = &tcp_orphan_count,
    .memory_allocated   = &tcp_memory_allocated,
    .memory_pressure    = &tcp_memory_pressure,
    .sysctl_mem     = sysctl_tcp_mem,
    .sysctl_wmem        = sysctl_tcp_wmem,
    .sysctl_rmem        = sysctl_tcp_rmem,
    .max_header     = MAX_TCP_HEADER,
    .obj_size       = sizeof(struct tcp_sock),
//  .slab_flags     = SLAB_DESTROY_BY_RCU,
    .twsk_prot      = &tcp_timewait_sock_ops,
    .rsk_prot       = &tcp_request_sock_ops,
    .h.hashinfo     = &tcp_hashinfo,
    .no_autobind        = true,
#ifdef CONFIG_COMPAT
    .compat_setsockopt  = compat_tcp_setsockopt,
    .compat_getsockopt  = compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
    .init_cgroup        = tcp_init_cgroup,
    .destroy_cgroup     = tcp_destroy_cgroup,
    .proto_cgroup       = tcp_proto_cgroup,
#endif
};

 sk->sk_prot->connect实际执行的就是tcp_v4_connect函数。
 博客[5]会继续分析tcp_v4_connect之后的操作,比如源端口的分配。端口的分配与函数inet_hash_connect相关,基于博客[6]中描述的原则。

如果用户已经绑定了端口,就使用绑定的端口。
如果用户没有绑定端口,则让系统自动选取,策略如下:

  1. 获取端口的取值区间,以及区间内端口的个数。
  2. 根据初始偏移量,从端口区间内的某个端口开始,遍历整个区间。
    2.1 如果端口是保留的,直接跳过。
    2.2 如果端口已经被使用了。
    2.2.1 不允许复用已经被bind()的端口。
    2.2.2 检查端口是否能被重用,可以的话就重用此端口。
    2.3 如果端口没有被使用过,就选择此端口。
/*
 * Excerpt of tcp_v4_connect() (most of the body is elided by the article).
 * The two steps shown are:
 *   1. a route lookup for the connection via ip_route_connect();
 *   2. inet_hash_connect(), which the article ties to source-port
 *      allocation.
 * fl4, nexthop, inet, orig_sport and orig_dport are locals whose setup is
 * not shown here.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                  RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                  IPPROTO_TCP,
                  orig_sport, orig_dport, sk);
err = inet_hash_connect(&tcp_death_row, sk);
}
/*
 * Wrapper around __inet_hash_connect(): supplies the port offset computed by
 * inet_sk_port_offset(sk) and passes __inet_check_established and
 * __inet_hash_nolisten as the check_established / hash callbacks.
 * Returns the result of __inet_hash_connect().
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
              struct sock *sk)
{
    return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
            __inet_check_established, __inet_hash_nolisten);
}
/*
 * Excerpt of __inet_hash_connect() (the port search itself is elided by the
 * article). snum, tb and port are locals whose setup is not shown here;
 * snum is presumably the local port already bound to the socket, 0 if none
 * (TODO confirm against the full source).
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,
        int (*check_established)(struct inet_timewait_death_row *,
            struct sock *, __u16, struct inet_timewait_sock **),
        int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
    if (!snum) {
  // port allocation
}
  inet_bind_hash(sk, tb, port);
}

reference:
[1] 一个简单的内核Socket Client例子
[2] 2.1 Socket系统调用
[3] 第一个Linux网络设备驱动——最简虚拟网卡virnet
[4] Linux串口网卡(一)——通用虚拟网卡的实现
[5] Linux TCP/IP 协议栈之 Socket的实现分析(Connect客户端发起连接请求)
[6] TCP连接建立系列 — 客户端的端口选取和重用
[7] 路由表

相关文章

网友评论

      本文标题:linux内核tcp协议栈走读记录(一)

      本文链接:https://www.haomeiwen.com/subject/ktullctx.html