kernel/net revisited: the dev/ip/tcp stack, epoll, netns, iptables, etc.


I had already read parts of the kernel/net code back when I was studying the libpcap/tcpdump sources. Last Sunday I stumbled upon the recently published book 《深入理解Linux网络》, and its author happens to maintain the WeChat account whose articles I used as references for libpcap back then. After a quick look at the table of contents I ordered it without hesitation (this was my first time trying my university library's borrow-to-purchase service, and it really is convenient: order on the platform, receive the book, read it, then just hand it over to the library).

I finished it in four or five days, took another, deeper pass through the kernel/net code along the way, and also spotted a small error in the book, which I reported to the author over WeChat. Overall it is a good book and I got a lot out of it. Each chapter opens with a set of questions, walks through the relevant source code and principles, and answers them at the end. Most of the questions answer themselves as you read, though a few are rather broad or border on clickbait. Most chapters are worth reading, but some feel padded; if you want a quick pass, the figures plus each chapter's summary are enough. In any case it is a quality book (otherwise it would not have sold over ten thousand copies within a few months), and the illustrations are excellent.

I will lay this post out following the book's chapter structure.

Kernel packet reception

# hard IRQ
igb_msix_ring
# soft IRQ
napi_poll
# NIC poll
igb_poll
# pull skbs off the RX ring
igb_clean_rx_irq
# deliver the skb
deliver_skb
# IP layer
ip_rcv
# TCP layer
tcp_v4_rcv


net_dev_init registers the per-CPU softnet_data queues,
and registers the softirq handlers into softirq_vec, so that later dispatch can branch on the softirq type:

// include/linux/interrupt.h
enum
{
	HI_SOFTIRQ=0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	IRQ_POLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ,
	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */

	NR_SOFTIRQS
};
struct softirq_action
{
	void	(*action)(struct softirq_action *);
};
// net/core/dev.c
static int __init net_dev_init(void)
{
    /*
    * Incoming packets are placed on per-CPU queues
    */
    // struct softnet_data {}
    /*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
		sd->cpu = i;
#endif

		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}
    ...
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
    ...
}
// kernel/softirq.c
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
	softirq_vec[nr].action = action;
}

Protocol registration:
inet_add_protocol registers the L4 protocols into the inet_protos array.
dev_add_pack adds ip_packet_type to the ptype_base hash table.
The insertion logic is: if packet_type == ETH_P_ALL, the entry goes onto ptype_all instead, which is exactly how libpcap captures packets.

// net/ipv4/af_inet.c
static struct net_protocol tcp_protocol = {
	.early_demux	=	tcp_v4_early_demux,
	.early_demux_handler =  tcp_v4_early_demux,
	.handler	=	tcp_v4_rcv,
	.err_handler	=	tcp_v4_err,
	.no_policy	=	1,
	.icmp_strict_tag_validation = 1,
};
static struct net_protocol udp_protocol = {
	.early_demux =	udp_v4_early_demux,
	.early_demux_handler =	udp_v4_early_demux,
	.handler =	udp_rcv,
	.err_handler =	udp_err,
	.no_policy =	1,
};
static const struct net_protocol icmp_protocol = {
	.handler =	icmp_rcv,
	.err_handler =	icmp_err,
	.no_policy =	1,
};
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
	.list_func = ip_list_rcv,
};
static int __init inet_init(void){
    /*
	 *	Add all the base protocols.
	 */

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
    ...
	/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);
    // insert every element of inetsw_array into the inetsw list
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);
    ...

    dev_add_pack(&ip_packet_type);
}
// net/ipv4/protocol.c
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}
// net/core/dev.c
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
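
As an aside on the libpcap point above: it is a userspace AF_PACKET socket that triggers this ptype_all registration. A minimal, self-contained capture sketch (illustrative only; error handling trimmed, and it needs CAP_NET_RAW):

// Opening an AF_PACKET socket with ETH_P_ALL makes the kernel register
// a packet_type via dev_add_pack(), which ptype_head() files under
// ptype_all, so this socket sees every frame — the libpcap principle.
#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>        // htons
#include <linux/if_ether.h>   // ETH_P_ALL

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");  // requires CAP_NET_RAW
		return 1;
	}
	char frame[2048];
	ssize_t n = recv(fd, frame, sizeof(frame), 0);  // one raw L2 frame
	printf("captured %zd bytes\n", n);
	return 0;
}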

The NIC driver registers igb_probe, which brings the NIC up:
it installs igb_netdev_ops into netdev_ops and adds igb_poll as the NAPI poll function.
igb_netdev_ops includes igb_open, which sets up the resources: it creates the ring buffers, registers the interrupt handler igb_msix_ring with a dedicated MSI-X interrupt per queue, enables NAPI, and so on.

// drivers/net/ethernet/intel/igb/igb_main.c
static struct pci_driver igb_driver = {
	.name     = igb_driver_name,
	.id_table = igb_pci_tbl,
	.probe    = igb_probe,
	.remove   = igb_remove,
#ifdef CONFIG_PM
	.driver.pm = &igb_pm_ops,
#endif
	.shutdown = igb_shutdown,
	.sriov_configure = igb_pci_sriov_configure,
	.err_handler = &igb_err_handler
};
static int __init igb_init_module(void)
{
	ret = pci_register_driver(&igb_driver);
}
static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    netdev->netdev_ops = &igb_netdev_ops;
}
static int igb_alloc_q_vector(struct igb_adapter *adapter,
			      int v_count, int v_idx,
			      int txr_count, int txr_idx,
			      int rxr_count, int rxr_idx)
{
	/* initialize NAPI */
	netif_napi_add(adapter->netdev, &q_vector->napi,
		       igb_poll, 64);
}
// set up a dedicated MSI-X interrupt per RX queue, with igb_msix_ring as the handler
static int igb_request_msix(struct igb_adapter *adapter){
    for (i = 0; i < adapter->num_q_vectors; i++) {
        err = request_irq(adapter->msix_entries[vector].vector,
                  igb_msix_ring, 0, q_vector->name,
    }
}
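
With one MSI-X vector per queue, /proc/interrupts shows a separate interrupt line per ring (named after q_vector->name, typically something like eth0-TxRx-0), so each queue's hard interrupts can be steered to a different CPU via /proc/irq/<n>/smp_affinity or irqbalance.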

When data arrives:

igb_msix_ring
__raise_softirq_irqoff(NET_RX_SOFTIRQ)
__do_softirq
net_rx_action
napi_poll
igb_poll
igb_clean_rx_irq
napi_gro_receive->napi_skb_finish->gro_normal_one->gro_normal_list->netif_receive_skb_list_internal->__netif_receive_skb_list->__netif_receive_skb_list_core->__netif_receive_skb_core
deliver_skb
ip_rcv
ip_rcv_finish->dst_input->ip_local_deliver->ip_local_deliver_finish->ip_protocol_deliver_rcu->INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb);
tcp_v4_rcv

// drivers/net/ethernet/intel/igb/igb_main.c
// hard IRQ handler
static irqreturn_t igb_msix_ring(int irq, void *data)
{
	struct igb_q_vector *q_vector = data;

	/* Write the ITR value calculated from the previous interrupt. */
	igb_write_itr(q_vector);

	napi_schedule(&q_vector->napi);

	return IRQ_HANDLED;
}
// include/linux/netdevice.h
// add napi->poll_list to sd->poll_list; as noted above, sd is a per-CPU queue, so the softirq runs on the same CPU that handled the hard IRQ
// the softirq then invokes the NET_RX_SOFTIRQ handler
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
// kernel/softirq.c
void __raise_softirq_irqoff(unsigned int nr)
{
	lockdep_assert_irqs_disabled();
	trace_softirq_raise(nr);
	or_softirq_pending(1UL << nr);
}
static void run_ksoftirqd(unsigned int cpu)
{
	ksoftirqd_run_begin();
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on inline stack, as we are not deep
		 * in the task stack here.
		 */
		__do_softirq();
		ksoftirqd_run_end();
		cond_resched();
		return;
	}
	ksoftirqd_run_end();
}
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    h = softirq_vec;
	while ((softirq_bit = ffs(pending))) {
        unsigned int vec_nr;    // only used for tracing below; that code is omitted here
        h += softirq_bit - 1;
        // the action was registered earlier in net_dev_init->open_softirq
        h->action(h);
		h++;
        pending >>= softirq_bit;
    }
}
// drivers/net/ethernet/intel/igb/igb_main.c
// igb_poll - NAPI Rx polling callback
static int igb_poll(struct napi_struct *napi, int budget)
{
	if (q_vector->tx.ring)
		clean_complete = igb_clean_tx_irq(q_vector, budget);

	if (q_vector->rx.ring) {
		int cleaned = igb_clean_rx_irq(q_vector, budget);
}
static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
{
	struct igb_adapter *adapter = q_vector->adapter;
	struct igb_ring *rx_ring = q_vector->rx.ring;
	struct sk_buff *skb = rx_ring->skb;
    ...
    while (likely(total_packets < budget)) {
        rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
        napi_gro_receive(&q_vector->napi, skb);
    }
}
// net/core/dev.c
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
				    struct packet_type **ppt_prev)
{
    // libpcap tap point (note: the packet has not entered the protocol stack yet)
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
}
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	refcount_inc(&skb->users);
    // packet_type here is the ip_packet_type registered earlier by dev_add_pack, so func is ip_rcv
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
// net/ipv4/ip_input.c
// ip_rcv runs the PREROUTING hook
// ip_local_deliver runs the LOCAL_IN hook
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	   struct net_device *orig_dev)
{
	struct net *net = dev_net(dev);

	skb = ip_rcv_core(skb, net);
	if (skb == NULL)
		return NET_RX_DROP;

	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
		       net, NULL, skb, dev, NULL,
		       ip_rcv_finish);
}
ip_rcv_finish->dst_input->ip_local_deliver
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */
	struct net *net = dev_net(skb->dev);

	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       net, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
ip_local_deliver_finish->ip_protocol_deliver_rcu
INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
	const struct net_protocol *ipprot;
	int raw, ret;

resubmit:
	raw = raw_local_deliver(skb, protocol);

	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot) {
		if (!ipprot->no_policy) {
			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				kfree_skb_reason(skb,
						 SKB_DROP_REASON_XFRM_POLICY);
				return;
			}
			nf_reset_ct(skb);
		}
		ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
				      skb);
		if (ret < 0) {
			protocol = -ret;
			goto resubmit;
		}
		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
	} else {
		if (!raw) {
			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
				icmp_send(skb, ICMP_DEST_UNREACH,
					  ICMP_PROT_UNREACH, 0);
			}
			kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
		} else {
			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
			consume_skb(skb);
		}
	}
}
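
To close the loop, here is a heavily condensed, paraphrased view of what tcp_v4_rcv does next (based on net/ipv4/tcp_ipv4.c; exact signatures vary across kernel versions): look up the owning socket by its 4-tuple, then hand the skb to the TCP state machine.

// Paraphrased sketch of tcp_v4_rcv (net/ipv4/tcp_ipv4.c), not verbatim
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct sock *sk;

	// look up the owning socket by 4-tuple: ehash for established
	// connections, the listener hash otherwise
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
			       th->source, th->dest, ...);
	...
	// hand off to the state machine; for ESTABLISHED sockets this
	// reaches tcp_rcv_established (see the BIO section below)
	return tcp_v4_do_rcv(sk, skb);
}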

BIO/NIO

BIO


recvfrom reads from sk_receive_queue when data is available; when there is none, the task is parked on sk_wq.

recvfrom->inet_recvmsg->tcp_recvmsg->sk_wait_data->sk_sleep(sk) blocks

struct sock {
	struct sock_common	__sk_common;
#define sk_prot			__sk_common.skc_prot
	struct sk_buff_head	sk_receive_queue;
    union {
		struct socket_wq __rcu	*sk_wq;
		/* private: */
		struct socket_wq	*sk_wq_raw;
		/* public: */
	};
}
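
The blocking step itself is small. A simplified model of sk_wait_data (net/core/sock.c; not verbatim): the task parks itself on sk_sleep(sk) and sleeps until data shows up or the timeout expires.

// Simplified model of sk_wait_data (net/core/sock.c), not verbatim:
// park the current task on the socket's wait queue and sleep until
// sk_receive_queue is non-empty or the timeout runs out.
static long sk_wait_data_sketch(struct sock *sk, long timeo)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(sk_sleep(sk), &wait);
	while (timeo && skb_queue_empty(&sk->sk_receive_queue))
		// woken by sk_data_ready -> sock_def_readable (below)
		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
	remove_wait_queue(sk_sleep(sk), &wait);
	return timeo;
}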

In tcp_v4_rcv, tcp_rcv_established both queues the data onto sk_receive_queue and, via sk_data_ready, wakes the tasks waiting on sk_wq:

                                            ->tcp_queue_rcv
tcp_v4_rcv->tcp_v4_do_rcv->tcp_rcv_established
                                            ->sk_data_ready->sock_def_readable
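
sock_def_readable is short enough to quote in lightly condensed form (net/core/sock.c): "waking the socket" just means walking the entries hanging on sk_wq.

// sock_def_readable (net/core/sock.c), lightly condensed: the default
// sk_data_ready callback wakes whatever sleeps on the socket's wait
// queue. For BIO that is the recvfrom task; with epoll the queue
// entry's func is ep_poll_callback instead (next section).
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	rcu_read_unlock();
}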

NIO

In essence, the difference between epoll and blocking I/O is which wait queue gets woken: the epoll wq versus the socket's own wq.
Internally, epoll keeps the many monitored sockets in a red-black tree (rbr); rdllist and wq are the ready list and the wait queue. Each wait-queue entry carries a func that wakes the process its private field points to. When a socket in the rb-tree receives data, its callback is ep_poll_callback, which does not need to wake the socket's owner process, so private is NULL; entries on the epoll wq do need to wake the epoll_wait caller, so their func is default_wake_function and private points to the current task itself. The essence is that many sockets are bundled under a single process, reducing the performance cost of context switches.

struct eventpoll {
    /* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;
	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;
	struct list_head rdllist;
	struct rb_root_cached rbr;
	struct epitem *ovflist;
    struct file *file;
};
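
For orientation, a minimal self-contained usage sketch (illustrative; the port number is arbitrary and error handling is omitted) that exercises exactly these structures:

// Minimal epoll usage: EPOLL_CTL_ADD hangs ep_poll_callback on the
// socket's wait queue and files the epitem into ep->rbr; epoll_wait
// then sleeps on ep->wq until ep->rdllist becomes non-empty.
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(8080),
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 128);

	int epfd = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = lfd };
	epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);   // -> ep_insert()

	struct epoll_event ready[64];
	for (;;) {
		// blocks on ep->wq until a socket becomes ready
		int n = epoll_wait(epfd, ready, 64, -1);
		for (int i = 0; i < n; i++)
			printf("fd %d is readable\n", ready[i].data.fd);
	}
}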

epoll_ctl with EPOLL_CTL_ADD inserts the socket and sets its wait-queue callback to ep_poll_callback. Since default_wake_function wakes the process stored in the entry's private field, while ep_poll_callback only notifies the epoll instance, there is no need to set private on this entry.

// fs/eventpoll.c
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
    // register the poll callback
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
    // trigger the callback once through the file's poll op
    revents = ep_item_poll(epi, &epq.pt, 1);
}
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct file *file = epi->ffd.file;
	__poll_t res;

	pt->_key = epi->event.events;
	if (!is_file_epoll(file))
		res = vfs_poll(file, pt);
	else
		res = __ep_eventpoll_poll(file, pt, depth);
	return res & epi->event.events;
}
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
	return file->f_op->poll(file, pt);
}
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
    init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
    if (epi->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &pwq->wait);
	else
        // put ep_poll_callback onto the socket wait queue whead
        // when data arrives on the socket, the callbacks on its wq fire
        // i.e. incoming data no longer wakes the process that created the socket; it calls ep_poll_callback instead
		add_wait_queue(whead, &pwq->wait);
}
// net/ipv4/af_inet.c
const struct proto_ops inet_stream_ops = {
	.poll		   = tcp_poll,
}
// all elements of inetsw_array were inserted into the inetsw list during inet_init
static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},
}
// net/ipv4/tcp.c
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	sock_poll_wait(file, sock, wait);
}
// include/net/sock.h
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
    poll_wait(filp, &sock->wq.wait, p);
}
// include/linux/poll.h
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_qproc = qproc;
	pt->_key   = ~(__poll_t)0; /* all events enabled */
}
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p);
}
// include/linux/wait.h
// init_waitqueue_func_entry sets func and leaves private NULL
// init_waitqueue_entry defaults to the current task with default_wake_function
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
	wq_entry->flags		= 0;
	wq_entry->private	= NULL;
	wq_entry->func		= func;
}

epoll_wait takes ready sockets off epoll->rdllist; if there are none, it creates a wait entry, adds it to epoll->wq, and the epoll_wait task blocks.

// include/linux/wait.h
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
	wq_entry->flags		= 0;
	wq_entry->private	= p;
	wq_entry->func		= default_wake_function;
}
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    // wait is the entry that was added to the socket wq earlier; recover its epitem, and through that the eventpoll object
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;
    // add the epitem to the ready list
        list_add_tail_lockless(&epi->rdllink, &ep->rdllist)
    // if ep->wq has matching entries, wake them; that is the blocked epoll_wait
	if (waitqueue_active(&ep->wq)) {
        ...
        // this eventually calls default_wake_function to wake the waiting task
		wake_up(&ep->wq);
	}
}

Kernel packet transmission

inet_sendmsg
sk->sk_prot->sendmsg
tcp_sendmsg
icsk->icsk_af_ops->queue_xmit(skb)
ip_queue_xmit
ip_local_out
ip_finish_output2
dst_neigh_output
neigh_hh_output
dev_queue_xmit
dev_hard_start_xmit
igb_xmit_frame
igb_xmit_frame_ring

Note that TX completion cleanup also runs in the NET_RX_SOFTIRQ handler:

net_rx_action
igb_poll
igb_clean_tx_irq
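
The stack-to-driver hand-off goes through the net_device_ops table installed in igb_probe. Modeled on __netdev_start_xmit in include/linux/netdevice.h (trimmed; not the exact source):

// dev_hard_start_xmit reaches the driver through the ops table that
// igb_probe registered, so ndo_start_xmit resolves to igb_xmit_frame.
static netdev_tx_t netdev_start_xmit_sketch(struct sk_buff *skb,
					    struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	return ops->ndo_start_xmit(skb, dev);	// -> igb_xmit_frame
}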

// net/ipv4/tcp_ipv4.c
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
// net/ipv4/ip_output.c
// egress path: this is also where the iptables LOCAL_OUT and POSTROUTING hooks sit
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
    ...
	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

lo

lo traffic walks the normal network-stack path.
The only difference from a real device is that reception skips the hard interrupt and goes straight to softirq processing.
A normal device has an MTU of 1500; lo is a virtual device with an MTU of 65536, which reduces fragmentation and therefore resource consumption.

ip route list table local

A routing table is, at its core, a way to find the network device:
fib_lookup consults the routing table and sets the dev, feeding the packet into lo.

// drivers/net/loopback.c
static netdev_tx_t loopback_xmit(struct sk_buff *skb,
				 struct net_device *dev)
{
	int len;

	skb_tx_timestamp(skb);

	/* do not fool net_timestamp_check() with various clock bases */
	skb_clear_tstamp(skb);

    // detach the skb from its original socket
	skb_orphan(skb);

	/* Before queueing this packet to __netif_rx(),
	 * make sure dst is refcounted.
	 */
	skb_dst_force(skb);

	skb->protocol = eth_type_trans(skb, dev);

	len = skb->len;
	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
		dev_lstats_add(dev, len);

	return NETDEV_TX_OK;
}
// net/core/dev.c
__netif_rx->netif_rx_internal->enqueue_to_backlog
// queue an skb to a per CPU backlog queue
// the control flow may look odd but is right: if the queue is non-empty, just enqueue; if it was empty, schedule the backlog NAPI first, then enqueue
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
    ...
    sd = &per_cpu(softnet_data, cpu);
    ...
    qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock_irq_restore(sd, &flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
			napi_schedule_rps(sd);
		goto enqueue;
	}
    ...
}
// net/core/dev.c
static int __init net_dev_init(void)
{
    for_each_possible_cpu(i) {
        sd->backlog.poll = process_backlog;
    }
}
//input_pkt_queue->skb1->skb2->skb3
//process_queue
// --->
//input_pkt_queue
//process_queue->skb1->skb2->skb3
static int process_backlog(struct napi_struct *napi, int quota)
{
    while ((skb = __skb_dequeue(&sd->process_queue))) {
        rcu_read_lock();
        __netif_receive_skb(skb);
        rcu_read_unlock();
        input_queue_head_incr(sd);
        if (++work >= quota)
            return work;
    }
    skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
}

TCP

Connection establishment

listen initializes the accept (full-connection) queue and the syn (half-open) queue; both are of bounded length.

accept simply takes an established connection off the accept queue.

connect sends the SYN; if the server's syn queue is full the SYN is dropped, otherwise the server appends a request to the syn queue and replies with SYN+ACK.

When the third-handshake ACK arrives, the connection is appended to the accept queue; if that queue is full it is dropped.

In all cases the state is changed first and the packet sent afterwards.
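
In userspace terms, the bound on the accept queue is the backlog argument of listen(), clamped by net.core.somaxconn; the syn queue bound is derived from net.ipv4.tcp_max_syn_backlog. A minimal sketch (port number arbitrary, error handling omitted):

// The listen() backlog caps the accept queue at min(backlog, somaxconn).
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in a = { .sin_family = AF_INET,
				 .sin_port = htons(8080),
				 .sin_addr.s_addr = htonl(INADDR_ANY) };

	bind(fd, (struct sockaddr *)&a, sizeof(a));
	listen(fd, 511);	// accept queue <= min(511, net.core.somaxconn)
	pause();		// hold the listener open
	return 0;
}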

// include/net/request_sock.h
struct request_sock_queue {
	spinlock_t		rskq_lock;
	u8			rskq_defer_accept;

	u32			synflood_warned;
	atomic_t		qlen;	/* current syn (half-open) queue length */
	atomic_t		young;	/* requests whose SYN+ACK has not been retransmitted yet */

    // accept (full-connection) queue
	struct request_sock	*rskq_accept_head;
	struct request_sock	*rskq_accept_tail;
    // TCP Fast Open queue; TFO lets the SYN itself carry data
	struct fastopen_queue	fastopenq;  /* Check max_qlen != 0 to determine
					     * if TFO is enabled.
					     */
};
//file: net/ipv4/tcp_ipv4.c
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    // is the syn (half-open) queue full?
    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
        want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
        if (!want_cookie)
            goto drop;
    }

    // when the accept queue is full and there are young requests, drop outright
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
            goto drop;
    }
    ...
    // allocate the request_sock kernel object
    req = inet_reqsk_alloc(&tcp_request_sock_ops);

    // build the SYN+ACK packet
    skb_synack = tcp_make_synack(sk, dst, req,
    fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);

    if (likely(!do_fastopen)) {
        // send the SYN+ACK response
        err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
            ireq->rmt_addr, ireq->opt);

        // add to the syn queue and start the retransmit timer
        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
    }else
    ...
}
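
When such drops happen they are observable: netstat -s reports the LINUX_MIB_LISTENOVERFLOWS counter above as "times the listen queue of a socket overflowed", and for a listening socket ss -lnt shows the current accept-queue length in Recv-Q and its limit in Send-Q.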

Memory consumption

An ESTABLISHED socket costs about 3.3 KB, a TIME_WAIT socket about 0.3 KB.

So roughly 4 GB of memory is enough for about 1,000,000 ESTABLISHED connections (1,000,000 x 3.3 KB ≈ 3.3 GB).

Maximum number of TCP connections

File-descriptor limits
# process level
fs.nr_open: ceiling that applies to all users
nofile (ulimit): per-user limits
# system level
fs.file-max: system-wide maximum number of open files
# tuple space
The (saddr, sport, daddr, dport) space is theoretically enormous: a server listening on one port can in principle distinguish 2^32 client IPs x 2^16 client ports, so the practical ceiling is file descriptors and memory rather than the tuple space.

Container networking

Each process references its namespaces through nsproxy.


Passing CLONE_NEWNET to clone/unshare gives the task its own network namespace.
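
A minimal sketch of creating one from userspace (needs CAP_SYS_ADMIN; the ip invocation is just for illustration):

// unshare(CLONE_NEWNET) detaches this process into a fresh network
// namespace: its nsproxy now points at a new struct net with its own
// devices, routes, and iptables rules.
#define _GNU_SOURCE
#include <sched.h>    // unshare, CLONE_NEWNET
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	if (unshare(CLONE_NEWNET) != 0) {
		perror("unshare");
		return 1;
	}
	// only an (initially down) loopback device is visible here
	return system("ip link show");
}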


With a bridge, traffic passes through iptables (nat table lookups) and frames are forwarded from one bridge port to another:

// net/bridge/br_forward.c (condensed): retarget the skb at the
// destination port's device and transmit it again
skb->dev = to->dev;
dev_queue_xmit(skb);

