
Reading parts of the libpcap, tcpdump, and kernel/net sources


Begin

This article grew out of an OS course assignment to add a new system call. My idea at the time was a packet-capture system call, roughly a stripped-down tcpdump stitched into the kernel, so I set out to read the libpcap, tcpdump, and kernel/net sources. The code base is large and untangling the architecture takes effort, which is why only part of this write-up is finished now.

libpcap version used in this article: 1.10.1 (the latest release at the time of reading, December 2021)
kernel: 5.15.6

The history of packet capture falls into two eras:

the ancient approach, socket(PF_INET) + recvfrom(), which costs several memory copies and system calls per packet and is therefore slow;
the current approach, socket(PF_PACKET) + memory-mapped ring, which cuts down on system calls and copies and is far more efficient.

Commonly used libpcap interfaces

Capturing with libpcap typically involves the following interface functions:

// 1. Open a network interface for capturing
// pcap.c
pcap_t	*pcap_open_live(const char *, int, int, int, char *);
/*
	const char *device	interface name, e.g. the string returned by pcap_lookupdev(), or given explicitly such as "eth0"
	int snaplen			maximum number of bytes to capture per packet
	int promisc			1 = promiscuous mode, 0 = non-promiscuous
	int to_ms			read timeout in milliseconds
	char *errbuf		buffer that receives the error message
*/

// 2. Get the IP address and netmask of the given interface
// pcap.c
int	pcap_lookupnet(const char *, bpf_u_int32 *, bpf_u_int32 *, char *);
/*
	const char *device	device name, given explicitly (e.g. eth0) or taken from pcap_lookupdev()
	bpf_u_int32 *netp	pointer that receives the network address
	bpf_u_int32 *maskp	pointer that receives the netmask
	char *errbuf		buffer that receives the error message
*/


// 3. Compile a BPF filter expression
// gencode.c
int	pcap_compile(pcap_t *, struct bpf_program *, const char *, int,
	    bpf_u_int32);
/*
	pcap_t *p				the pcap_t created and returned by pcap_open_live()
	struct bpf_program *fp	receives the compiled BPF program; pcap_setfilter() takes this pointer
	const char *buf			the filter expression
	int optimize			whether to optimize the compiled filter
	bpf_u_int32 mask		netmask of the local network
*/


// 4. Install the filter
// pcap.c
int	pcap_setfilter(pcap_t *, struct bpf_program *);
/*
	pcap_t *p				the pcap_t created and returned by pcap_open_live()
	struct bpf_program *fp	the BPF program compiled by pcap_compile() [its second argument]
*/

// 5. Capture packets in a loop
int	pcap_loop(pcap_t *, int, pcap_handler, u_char *);
/*
	pcap_t *p				the pcap_t created and returned by pcap_open_live()
	int cnt					number of packets to capture; returns as soon as that many have been seen, -1 loops forever
	pcap_handler callback	callback that handles each captured packet
	u_char *user			opaque argument passed through to the callback

	typedef void (*pcap_handler)(u_char *, const struct pcap_pkthdr *,
			     const u_char *);
				u_char *user						the last argument of pcap_loop()
				const struct pcap_pkthdr *pkthdr	header describing the capture, same as for pcap_next()
				const u_char *packet				the captured packet data
*/


// 6. Close the capture handle
void pcap_close(pcap_t *);
/*
	pcap_t *p	the pcap_t created and returned by pcap_open_live()
*/

Capture methods

socket + recvfrom

sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
len = recvfrom(sock, buffer, BUFFER_MAX, 0, NULL, NULL);
// parse the raw frame now sitting in buffer
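
Fleshed out, a minimal self-contained sketch of this legacy style looks roughly like the following (standard Linux APIs; needs root or CAP_NET_RAW; error handling mostly trimmed):

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define BUFFER_MAX 2048

int main(void)
{
	unsigned char buffer[BUFFER_MAX];
	/* one PF_PACKET raw socket; each recvfrom() below costs a system call
	 * plus a kernel-to-user copy per packet, which is exactly why this
	 * style is slow */
	int sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (sock < 0) {
		perror("socket");
		return 1;
	}
	for (;;) {
		ssize_t len = recvfrom(sock, buffer, BUFFER_MAX, 0, NULL, NULL);
		if (len < 14)	/* shorter than an Ethernet header */
			continue;
		/* bytes 12..13 of the frame are the EtherType */
		printf("frame: %zd bytes, ethertype 0x%04x\n",
		       len, (buffer[12] << 8) | buffer[13]);
	}
	close(sock);	/* not reached */
	return 0;
}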

Tcpdump

int main(int argc, char **argv)
{
	...
	pc = pcap_create(device, ebuf);
	...
	status = pcap_activate(pc);
	...
	if (WFileName) {
		...
		if (Cflag != 0 || Gflag != 0) {
			...
			callback = dump_packet_and_trunc;
			...
		}else{
			callback = dump_packet;
			...
		}
	}else{
		...
		callback = print_packet;
		...
	}
	...
	do{
		status = pcap_loop(pd, cnt, callback, pcap_userdata);
		...
	}
}

Example: pcap_loop

static void ethernet_packet(u_char *args, const struct pcap_pkthdr *header, const u_char *packet) {
	// per-packet processing goes here
	return;
}

static void * sniffer_thread_callback(void *param) {
	pthread_detach(pthread_self());
	printf("sniffer_thread_callback enter\n");
	int nDev = (int)(intptr_t)param;	// interface index passed through the thread argument

	char errbuf[PCAP_ERRBUF_SIZE];
	pcap_t *handle;

	char *filter_exp = "not broadcast and not arp";
	struct bpf_program fp;
	bpf_u_int32 mask = 0;
	bpf_u_int32 net = 0;

	handle = pcap_open_live(option_struct.dev_interface[nDev], 65535, 1, 0, errbuf);
	if (handle == NULL) {
		fprintf(stderr, "couldn't open device %s: %s\n", option_struct.dev_interface[nDev], errbuf);
		pthread_exit(NULL);
		return NULL;
	}

	pcap_lookupnet(option_struct.dev_interface[nDev], &net, &mask, errbuf);

	if (pcap_datalink(handle) != DLT_EN10MB) {
		fprintf(stderr, "couldn't pcap_datalink %s\n", pcap_geterr(handle));
		pcap_close(handle);
		pthread_exit(NULL);
		return NULL;
	}
	if (pcap_compile(handle, &fp, filter_exp, 0, net) == -1) {
		fprintf(stderr, "couldn't parse filter %s: %s\n", filter_exp, pcap_geterr(handle));
		pcap_freecode(&fp);
		pcap_close(handle);
		pthread_exit(NULL);
		return NULL;
	}

	if (pcap_setfilter(handle, &fp) == -1) {
		fprintf(stderr, "couldn't install filter %s: %s\n", filter_exp, pcap_geterr(handle));
		pcap_freecode(&fp);
		pcap_close(handle);
		pthread_exit(NULL);
		return NULL;
	}

	if (pcap_loop(handle, -1, ethernet_packet, (u_char *)&nDev) == -1) {
		fprintf(stderr, "couldn't pacp loop  %s: %s\n", filter_exp, pcap_geterr(handle));
	}

	pcap_freecode(&fp);
	pcap_close(handle);
	pthread_exit(NULL);
	return NULL;
}

libpcap structures

The important structures are as follows:

// pcap-int.h
struct pcap {
	read_op_t read_op; // Method to call to read packets on a live capture.
	next_packet_op_t next_packet_op; // Method to call to read the next packet from a savefile.

	/*
	 * Read buffer.
	 */
	u_int bufsize;
	void *buffer;
	u_char *bp;
	int cc;

	sig_atomic_t break_loop; /* flag set to force break from packet-reading loop */

	void *priv;		/* private data for methods */

	// ...
};
// pcap-int.h
struct pcap_opt {
	char	*device;
	int	timeout;	/* timeout for buffering */
	u_int	buffer_size;
	int	promisc;
	int	rfmon;		/* monitor mode */
	int	immediate;	/* immediate mode - deliver packets as soon as they arrive */
	int	nonblock;	/* non-blocking mode - don't wait for packets to be delivered, return "no packets available" */
	int	tstamp_type;
	int	tstamp_precision;

    // linux
	int	protocol;	/* protocol to use when creating PF_PACKET socket */
    // WIN32
    int	nocapture_local;
};
// pcap-linux.c
struct pcap_linux {
	long long sysfs_dropped; /* packets reported dropped by /sys/class/net/{if_name}/statistics/rx_{missed,fifo}_errors */
	struct pcap_stat stat;

	char	*device;	/* device name */
	int	filter_in_userland; /* must filter in userland */
	int	blocks_to_filter_in_userland;
	int	must_do_on_close; /* stuff we must do when we close */
	int	timeout;	/* timeout for buffering */
	int	cooked;		/* using SOCK_DGRAM rather than SOCK_RAW */
	int	ifindex;	/* interface index of device we're bound to */
	int	lo_ifindex;	/* interface index of the loopback device */
	int	netdown;	/* we got an ENETDOWN and haven't resolved it */
	bpf_u_int32 oldmode;	/* mode to restore when turning monitor mode off */
	char	*mondevice;	/* mac80211 monitor device we created */
	u_char	*mmapbuf;	/* memory-mapped region pointer */
	size_t	mmapbuflen;	/* size of region */
	int	vlan_offset;	/* offset at which to insert vlan tags; if -1, don't insert */
	u_int	tp_version;	/* version of tpacket_hdr for mmaped ring */
	u_int	tp_hdrlen;	/* hdrlen of tpacket_hdr for mmaped ring */
	u_char	*oneshot_buffer; /* buffer for copy of packet */
	int	poll_timeout;	/* timeout to use in poll() */
#ifdef HAVE_TPACKET3
	unsigned char *current_packet; /* Current packet within the TPACKET_V3 block. Move to next block if NULL. */
	int packets_left; /* Unhandled packets left within the block from previous call to pcap_read_linux_mmap_v3 in case of TPACKET_V3. */
#endif
	int poll_breakloop_fd; /* fd to an eventfd to break from blocking operations */
};
// pcap/pcap.h
struct pcap_pkthdr {
	struct timeval ts;	/* time stamp */
	bpf_u_int32 caplen;	/* length of portion present */
	bpf_u_int32 len;	/* length of this packet (off wire) */
};

pcap_open_live

A stripped-down pcap_open_live is shown below; it essentially just calls two functions, pcap_create and pcap_activate.

pcap_t *
pcap_open_live(const char *device, int snaplen, int promisc, int to_ms, char *errbuf)
{
	pcap_t *p;
	int status;
	p = pcap_create(device, errbuf);
	// (setting p's default fields omitted)
	status = pcap_activate(p);
	return p;
}

pcap_create

The pcap_create() call chain is shown below. It sets two function pointers: activate_op, used by the subsequent activate step,
and can_set_rfmon_op, which governs monitor (wireless) mode (off by default when pcap_create_common() builds the handle).
configure defines it by default: $as_echo "#define HAVE_LIBNL 1" >>confdefs.h
pcap_can_set_rfmon_linux() then probes for a mac80211 device and, if one exists, reports monitor mode as supported.

pcap_open_live() [p = pcap_create(device, errbuf);] -> pcap_create() -> pcap_create_interface() -> pcap_create_common()
// in pcap-linux.c, pcap_create_interface() installs the handle's concrete function pointers, chiefly these two:
handle->activate_op = pcap_activate_linux;
handle->can_set_rfmon_op = pcap_can_set_rfmon_linux;

A simplified pcap_create() is shown below.
tcpdump's main() calls pcap_create() and pcap_activate() directly, then receives packets in a loop with pcap_loop():
https://github.com/the-tcpdump-group/tcpdump/blob/master/tcpdump.c#L1270
https://github.com/the-tcpdump-group/tcpdump/blob/master/tcpdump.c#L2600

pcap_t *
pcap_create(const char *device, char *errbuf)
{
	size_t i;
	int is_theirs;
	pcap_t *p;
	char *device_str;

	if (device == NULL)
		device_str = ("any");
	else {
		device_str = strdup(device);
	}

	p = pcap_create_interface(device_str, errbuf);
	p->opt.device = device_str;
	return (p);
}

pcap_activate

The pcap_activate() call chain is shown below: the handle's activate_op, i.e. the pcap_activate_linux installed earlier, runs and calls activate_pf_packet(), which creates the raw socket via the socket system call.

<!-- in pcap_open_live(): status = pcap_activate(p); -->
pcap_activate()->pcap_activate_linux()->activate_pf_packet()->socket()
										->setup_mmapped()->prepare_tpacket_socket()->init_tpacket()->getsockopt()&setsockopt()
														->create_ring()
// pcap_activate()
// invokes the function pointer installed by pcap_create_interface() during pcap_open_live(): pcap_activate_linux
// then, if non-blocking mode was requested beforehand, calls the pointer set inside pcap_activate_linux(): pcap_setnonblock_linux
status = p->activate_op(p);
status = p->setnonblock_op(p, 1);

A simplified pcap_activate() is shown below.

int
pcap_activate(pcap_t *p)
{
	int status;
	status = p->activate_op(p);

	// if non-blocking mode was set before calling pcap_activate(), turn it on now
	if (p->opt.nonblock) {
		status = p->setnonblock_op(p, 1);	// calls pcap_setnonblock_linux
	}
	p->activated = 1;
	return (status);
}
// pcap_activate_linux()
	// ......
	ret = activate_pf_packet(handle, is_any_device);
	// ......
	ret = setup_mmapped(handle, &status);

	// a series of function-pointer assignments follows


	// packet capture goes through this read interface
	// the kernel hands packets over via mmap
	// PACKET_MMAP is very efficient: it provides a configurable-size ring buffer mapped into user space
	// this way, reading a packet mostly means waiting for one to arrive; usually no system call is needed
	// the buffer shared between kernel and user space also cuts out a data copy
	
	switch (handlep->tp_version) {

	case TPACKET_V2:
		handle->read_op = pcap_read_linux_mmap_v2;
		break;
#ifdef HAVE_TPACKET3
	case TPACKET_V3:
		handle->read_op = pcap_read_linux_mmap_v3;
		break;
#endif
	}
	// CentOS 7 disables v3:
	// * Tue Dec 02 2014 Michal Sekletar <msekleta@redhat.com> - 14:1.5.3-4
	// - disable TPACKET_V3 memory mapped packet capture on AF_PACKET socket, use TPACKET_V2 instead (#1085096)

activate_pf_packet

In short, this creates the PF_PACKET socket:

// activate_pf_packet()
	sock_fd = is_any_device ?
		socket(PF_PACKET, SOCK_DGRAM, 0) :
		socket(PF_PACKET, SOCK_RAW, 0);
	...
	handle->fd = sock_fd;

setup_mmapped

// pcap-linux.c
static int
setup_mmapped(pcap_t *handle, int *status)
{
	struct pcap_linux *handlep = handle->priv;
	int ret;

	/*
	 * Attempt to allocate a buffer to hold the contents of one
	 * packet, for use by the oneshot callback.
	 */
	handlep->oneshot_buffer = malloc(handle->snapshot);

	// ...

	if (handle->opt.buffer_size == 0) {
		/* default to a 2 MB ring buffer */
		handle->opt.buffer_size = 2*1024*1024;
	}
	ret = prepare_tpacket_socket(handle);
	// ...
	ret = create_ring(handle, status);
	// ...

	/*
	 * handle->offset is used to get the current position into the rx ring.
	 * handle->cc is used to store the ring size.
	 */

	/*
	 * Set the timeout to use in poll() before returning.
	 */
	set_poll_timeout(handlep);

	return 1;
}

prepare_tpacket_socket() calls init_tpacket() for TPACKET_V3 when support for it was configured in, falling back to v2 otherwise; init_tpacket() returns -1 on failure, 0 on success, and 1 when the kernel is too old to support the requested version even though it supports memory-mapped capture.

// pcap-linux.c
static int prepare_tpacket_socket(pcap_t *handle)
{
	int ret;
#ifdef HAVE_TPACKET3
	ret = init_tpacket(handle, TPACKET_V3, "TPACKET_V3");
	if (ret != 1)
		return ret;	/* -1 = failure, 0 = v3 works */
	/* ret == 1: kernel has no v3 support, fall back to v2 */
#endif
	ret = init_tpacket(handle, TPACKET_V2, "TPACKET_V2");
	return ret;
}

init_tpacket() probes whether the kernel supports the requested TPACKET version and, if so, initializes it via getsockopt/setsockopt; nothing much to it.

// pcap-linux.c
static int
init_tpacket(pcap_t *handle, int version, const char *version_str)
{
	struct pcap_linux *handlep = handle->priv;
	int val = version;
	socklen_t len = sizeof(val);
	if (getsockopt(handle->fd, SOL_PACKET, PACKET_HDRLEN, &val, &len) < 0) {
		err...
	}
	/* hdrlen of tpacket_hdr for mmaped ring */
	handlep->tp_hdrlen = val;
	val = version;
	if (setsockopt(handle->fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val)) < 0) {
		err...
	}
	/* version of tpacket_hdr for mmaped ring */
	handlep->tp_version = version;

	return 0;
}

create_ring() mainly builds the ring:
it first fills in the tpacket_req structure matching the TPACKET version,
then installs it through setsockopt(PACKET_RX_RING),
memory-maps the rx ring (mmap is a system call),
and finally allocates a ring of headers and sets each header's frame pointer.

// create_ring()
struct tpacket_req {
	unsigned int	tp_block_size;	/* Minimal size of contiguous block */
	unsigned int	tp_block_nr;	/* Number of blocks */
	unsigned int	tp_frame_size;	/* Size of frame */
	unsigned int	tp_frame_nr;	/* Total number of frames */
};

struct tpacket_req3 {
	unsigned int	tp_block_size;	/* Minimal size of contiguous block */
	unsigned int	tp_block_nr;	/* Number of blocks */
	unsigned int	tp_frame_size;	/* Size of frame */
	unsigned int	tp_frame_nr;	/* Total number of frames */
	unsigned int	tp_retire_blk_tov; /* timeout in msecs */
	unsigned int	tp_sizeof_priv; /* offset to private data area */
	unsigned int	tp_feature_req_word;
};

if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
				(void *) &req, sizeof(req))) {
	return -1;
}

/* memory map the rx ring */
handlep->mmapbuflen = req.tp_block_nr * req.tp_block_size;
handlep->mmapbuf = mmap(0, handlep->mmapbuflen,
	PROT_READ|PROT_WRITE, MAP_SHARED, handle->fd, 0);

/* allocate a ring for each frame header pointer*/
handle->cc = req.tp_frame_nr;
handle->buffer = malloc(handle->cc * sizeof(union thdr *));

/* fill the header ring with proper frame ptr*/
handle->offset = 0;
for (i=0; i<req.tp_block_nr; ++i) {
	u_char *base = &handlep->mmapbuf[i*req.tp_block_size];
	for (j=0; j<frames_per_block; ++j, ++handle->offset) {
		RING_GET_CURRENT_FRAME(handle) = base;
		base += req.tp_frame_size;
	}
}

handle->bufsize = req.tp_frame_size;
handle->offset = 0;
return 1;

pcap_read_linux_mmap_v3

This is the loop that pulls packets out of the kernel and hands each one to the callback.

// pcap-linux.c
// the parameters are identical to pcap_loop's; pcap_loop is really just a wrapper around read_op (usage above, pcap_loop source further down)
static int
pcap_read_linux_mmap_v3(pcap_t *handle, int max_packets, pcap_handler callback,
		u_char *user)
{
	struct pcap_linux *handlep = handle->priv;
	union thdr h;
	int pkts = 0;
	int ret;

again:
	h.raw = RING_GET_CURRENT_FRAME(handle);
	// ...
	/* non-positive values of max_packets are used to require all
	 * packets currently available in the ring */
	while ((pkts < max_packets) || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
		int packets_to_read;

		if (handlep->current_packet == NULL) {
			h.raw = RING_GET_CURRENT_FRAME(handle);
			if (!packet_mmap_v3_acquire(h.h3))
				break;

			handlep->current_packet = h.raw + h.h3->hdr.bh1.offset_to_first_pkt;
			handlep->packets_left = h.h3->hdr.bh1.num_pkts;
		}
		packets_to_read = handlep->packets_left;

		if (!PACKET_COUNT_IS_UNLIMITED(max_packets) &&
		    packets_to_read > (max_packets - pkts)) {
			/*
			 * We've been given a maximum number of packets
			 * to process, and there are more packets in
			 * this buffer than that.  Only process enough
			 * of them to get us up to that maximum.
			 */
			packets_to_read = max_packets - pkts;
		}

		while (packets_to_read-- && !handle->break_loop) {
			struct tpacket3_hdr* tp3_hdr = (struct tpacket3_hdr*) handlep->current_packet;
			ret = pcap_handle_packet_mmap(
					handle,
					callback,
					user,
					handlep->current_packet,
					tp3_hdr->tp_len,
					tp3_hdr->tp_mac,
					tp3_hdr->tp_snaplen,
					tp3_hdr->tp_sec,
					handle->opt.tstamp_precision == PCAP_TSTAMP_PRECISION_NANO ? tp3_hdr->tp_nsec : tp3_hdr->tp_nsec / 1000,
					VLAN_VALID(tp3_hdr, &tp3_hdr->hv1),
					tp3_hdr->hv1.tp_vlan_tci,
					VLAN_TPID(tp3_hdr, &tp3_hdr->hv1));
			if (ret == 1) {
				pkts++;
			} else if (ret < 0) {
				handlep->current_packet = NULL;
				return ret;
			}
			handlep->current_packet += tp3_hdr->tp_next_offset;
			handlep->packets_left--;
		}

		if (handlep->packets_left <= 0) {
			/*
			 * Hand this block back to the kernel, and, if
			 * we're counting blocks that need to be
			 * filtered in userland after having been
			 * filtered by the kernel, count the one we've
			 * just processed.
			 */
			packet_mmap_v3_release(h.h3);
			if (handlep->blocks_to_filter_in_userland > 0) {
				handlep->blocks_to_filter_in_userland--;
				if (handlep->blocks_to_filter_in_userland == 0) {
					/*
					 * No more blocks need to be filtered
					 * in userland.
					 */
					handlep->filter_in_userland = 0;
				}
			}

			/* next block */
			if (++handle->offset >= handle->cc)
				handle->offset = 0;

			handlep->current_packet = NULL;
		}

		/* check for break loop condition*/
		if (handle->break_loop) {
			handle->break_loop = 0;
			return PCAP_ERROR_BREAK;
		}
	}
	if (pkts == 0 && handlep->timeout == 0) {
		/* Block until we see a packet. */
		goto again;
	}
	return pkts;
}

pcap_wait_for_frames_mmap -> poll: when no frame is ready in the ring, pcap_wait_for_frames_mmap() blocks in poll() on the socket fd.
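
Condensed from pcap_wait_for_frames_mmap() in pcap-linux.c (the real function also polls the poll_breakloop_fd eventfd shown in struct pcap_linux above, and handles timeouts, errors, and ENETDOWN):

// pcap_wait_for_frames_mmap(), condensed
struct pollfd pollinfo;
pollinfo.fd = handle->fd;
pollinfo.events = POLLIN;
// poll_timeout was derived from the to_ms passed at open time
ret = poll(&pollinfo, 1, handlep->poll_timeout);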

pcap_compile && pcap_setfilter

pcap_compile() involves lexing and parsing, i.e. compiler-construction territory, and is genuinely complex (its code is twice the size of the core); I'm not digging into it for now.

The compiled packet-matching code can be dumped as assembler-like instructions with tcpdump's -d option.

The man page: Dump the compiled packet-matching code in a human readable form to standard output and stop.

> sudo tcpdump host 192.168.142.1 -d
(000) ldh      [12]
(001) jeq      #0x800           jt 2    jf 6
(002) ld       [26]
(003) jeq      #0xc0a88e01      jt 12   jf 4
(004) ld       [30]
(005) jeq      #0xc0a88e01      jt 12   jf 13
(006) jeq      #0x806           jt 8    jf 7
(007) jeq      #0x8035          jt 8    jf 13
(008) ld       [28]
(009) jeq      #0xc0a88e01      jt 12   jf 10
(010) ld       [38]
(011) jeq      #0xc0a88e01      jt 12   jf 13
(012) ret      #262144
(013) ret      #0
jeq = jump if equal
jt  = jump target if true
jf  = jump target if false
ld  = load word, ldh = load halfword
14 instructions in total:
(000) loads the 2-byte value at offset 12
// PS: the Ethernet header starts with 6 bytes of destination MAC and 6 bytes of source MAC, so offset 12 is the EtherType
(001) compares it with 0x800; if equal go to (002), else go to (006)
// PS: 0x800 is the EtherType for IPv4
(002) loads the word at offset 26
// PS: the Ethernet header is 14 bytes (12 bytes of MACs + 2-byte type) and the IPv4 header is 20 bytes ending in the 8 bytes of source and destination address; 26 = 14 + 20 - 8, i.e. the source address
(003) compares it with 0xc0a88e01; if equal go to (012), else go to (004)
// PS: 0xc0a88e01 is the hex form of 192.168.142.1, the address given on the tcpdump command line; if the source address matches, jump straight to (012) and return
(004) loads the word at offset 30
// PS: reached when the source address didn't match; same arithmetic as above, 30 = 14 + 20 - 4, i.e. the destination address
(005) compares it with the given IP; if equal go to (012), else go to (013)
// PS: in other words, if it isn't the source, check whether it is the destination; if so, ret #262144
(006) compares the EtherType with 0x806; if equal go to (008), else go to (007)
// PS: reached when the type wasn't 0x800; 0x806 is the EtherType for ARP
(007) compares it with 0x8035; if equal go to (008), else go to (013)
// PS: 0x8035 is RARP (Reverse ARP)
(008) loads the word at offset 28
// PS: the ARP header is 28 bytes, ending with 6 bytes sender MAC, 4 bytes sender IP, 6 bytes target MAC, 4 bytes target IP; 28 = 14 + 28 - (4+6+4), i.e. the sender IP
(009) compares it with the given IP; if equal go to (012), else go to (010)
// PS: checks whether the ARP sender IP is the specified address
(010) loads the word at offset 38
// PS: same arithmetic as the ARP sender IP above; 38 = 14 + 28 - 4
(011) compares it with the given IP; if equal go to (012), else go to (013)
// PS: i.e. checks whether the ARP target IP is the specified address
(012) ret #262144  accepts the packet, capturing up to 262144 bytes
(013) ret #0       rejects the packet

The pcap_setfilter call chain ends in a setsockopt that tells the kernel to attach the filter:

pcap_setfilter->pcap_setfilter_linux->set_kernel_filter->setsockopt
setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER, 
			&total_fcode, sizeof(total_fcode))
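
The same socket option works directly on a raw socket. A minimal sketch, hand-writing the classic BPF program instead of calling pcap_compile (the four opcodes are what tcpdump -dd ip emits; sock_fd is assumed to be a PF_PACKET socket created as above):

#include <linux/filter.h>
#include <sys/socket.h>

/* classic BPF: accept only IPv4 frames (EtherType 0x0800) */
struct sock_filter code[] = {
	{ 0x28, 0, 0, 0x0000000c },	/* (000) ldh [12]                */
	{ 0x15, 0, 1, 0x00000800 },	/* (001) jeq #0x800  jt 2  jf 3  */
	{ 0x06, 0, 0, 0x00040000 },	/* (002) ret #262144 accept      */
	{ 0x06, 0, 0, 0x00000000 },	/* (003) ret #0      drop        */
};
struct sock_fprog bpf = {
	.len = sizeof(code) / sizeof(code[0]),
	.filter = code,
};
setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));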

pcap_loop

pcap_loop's logic is very simple: if p->rfile is non-NULL it reads from a savefile, otherwise it calls p->read_op().

And read_op() is the function pointer chosen in pcap_activate_linux() according to the TPACKET version: pcap_read_linux_mmap_v3 or pcap_read_linux_mmap_v2.

(Analyses of older versions found online show pcap_loop still looping on recvfrom to receive packets.)

int pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
	register int n;

	for (;;) {
		if (p->rfile != NULL) {
			// 0 means EOF, so don't loop if we get 0.
			n = pcap_offline_read(p, cnt, callback, user);
		} else {
			do {
				n = p->read_op(p, cnt, callback, user);
			} while (n == 0);
		}
		if (n <= 0)
			return (n);
		if (!PACKET_COUNT_IS_UNLIMITED(cnt)) {
			cnt -= n;
			if (cnt <= 0)
				return (0);
		}
	}
}

kernel/net structures

// include/net/sock.h
// network layer representation of sockets
struct sock {
	// ...
};
// net/packet/internal.h
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct packet_fanout	*fanout;
	union  tpacket_stats_u	stats;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running;	/* bind_lock must be held */
	unsigned int		auxdata:1,	/* writer must hold sock lock */
				origdev:1,
				has_vnet_hdr:1,
				tp_loss:1,
				tp_tx_has_off:1;
	int			pressure;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_rollover	*rollover;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_tstamp;
	struct completion	skb_completion;
	struct net_device __rcu	*cached_dev;
	int			(*xmit)(struct sk_buff *skb);
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
	atomic_t		tp_drops ____cacheline_aligned_in_smp;
};
// include/linux/netdevice.h
struct packet_type {
	__be16			type;	/* This is really htons(ether_type). */
	bool			ignore_outgoing;
	struct net_device	*dev;	/* NULL is wildcarded here	     */
	int			(*func) (struct sk_buff *,
					 struct net_device *,
					 struct packet_type *,
					 struct net_device *);
	void			(*list_func) (struct list_head *,
					      struct packet_type *,
					      struct net_device *);
	bool			(*id_match)(struct packet_type *ptype,
					    struct sock *sk);
	void			*af_packet_priv;
	struct list_head	list;
};

kernel/net source

net/packet/af_packet.c/packet_init

static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);

module_init registers the initializer: packet_init is called automatically when the module loads.

sock_register essentially registers struct net_proto_family packet_family_ops into the global net_families[NPROTO] array in socket.c.
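
For reference, the ops being registered (net/packet/af_packet.c):

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};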

net/socket.c

User space sets up communication with the socket() system call.

socket() essentially calls sock_create() and then binds the new socket to a file descriptor via sock_map_fd().
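
The entry path, abridged from net/socket.c:

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	return __sys_socket(family, type, protocol);
}

int __sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int retval, flags;
	// ...
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;
	// ...
	return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}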

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	......

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	......

	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

Here pf->create(net, sock, protocol, kern) invokes the create method that pf_packet's init bound to net_families[family], namely packet_create.

net/packet/af_packet.c/packet_create

As the comment above it says, this function "Create[s] a packet of type SOCK_PACKET."

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);		// allocate the sock
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;	// install the table of socket operations
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	// get the packet_sock po that embeds sk (struct sock has to be the first member of packet_sock)
	po = pkt_sk(sk);
	init_completion(&po->skb_completion);
	sk->sk_family = PF_PACKET;
	po->num = proto;
	po->xmit = dev_queue_xmit;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->rollover = NULL;
	po->prot_hook.func = packet_rcv;	// register packet_rcv as the handler in the packet_type struct (prot_hook in packet_sock has type packet_type)

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		__register_prot_hook(sk);	// calls dev_add_pack() in net/core/dev.c, adding the hook to the kernel's ptype_all list
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_tail_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}

As the comment on the type field says, it "is really htons(ether_type)".

The first field of packet_type, type, is important: it corresponds to the 2-byte EtherType in the link-layer header. When dev.c hands captured link-layer packets up to interested modules, it dispatches on that type. For example, with socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL)), ETH_P_ALL means every link-layer packet goes to the PF_PACKET module's handler; that handler is packet_type.func, i.e. the packet_rcv set up in packet_create.

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	refcount_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

net/packet/af_packet.c/packet_rcv

As for packet_rcv(),
its comment reads: "This function makes lazy skb cloning in hope that most of packets are discarded by BPF."
Clone lazily, in the hope that BPF drops most packets before a clone is ever needed.

The comment also notes that output packets arriving here were cloned by dev_queue_xmit_nit().

/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	bool is_drop_n_account = false;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev_has_header(dev)) {
		/* The device has an explicit notion of ll header,
		 * exported to higher levels.
		 *
		 * Otherwise, the device hides details of its frame
		 * structure, so that corresponding packet head is
		 * never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	// apply the installed filter
	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		consume_skb(skb);
		skb = nskb;
	}

	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_hatype = dev->type;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
	 * Use their space for storing the original skb length.
	 */
	PACKET_SKB_CB(skb)->sa.origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset_ct(skb);

	// finally, copy the packet that matched the application's filter onto the receive queue; that's how user space gets the traffic
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.stats1.tp_packets++;
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk);
	return 0;

drop_n_acct:
	is_drop_n_account = true;
	atomic_inc(&po->tp_drops);
	atomic_inc(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	if (!is_drop_n_account)
		consume_skb(skb);
	else
		kfree_skb(skb);
	return 0;
}

net/packet/af_packet.c/packet_recvmsg

User space creates the socket and the lower layers associate with it; from then on, every packet captured at the link layer is copied once to the upper interface, i.e. the packet_rcv callback registered by PF_PACKET. That callback filters packets against the BPF program set from user space, so whatever ends up in the receive queue is exactly what the application asked for. The final recvfrom step then copies packets from the receive queue up to the application.

The recvfrom system-call chain:

__sys_recvfrom->sock_recvmsg->sk.ops.recvmsg
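
Abridged from net/socket.c:

int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
		   struct sockaddr __user *addr, int __user *addr_len)
{
	struct socket *sock;
	struct msghdr msg;
	int err, fput_needed;
	// ...
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	// ...
	err = sock_recvmsg(sock, &msg, flags);	// ends up in sock->ops->recvmsg
	// ...
	return err;
}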

The recvmsg here is the packet_recvmsg bound in af_packet.c.

packet_recvmsg's comment in the source reads: "Pull a packet from our receive queue and hand it to the user."

static const struct proto_ops packet_ops = {
...
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
...
};
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
			  int flags)
{
	// (body elided) roughly: skb_recv_datagram() dequeues an skb from
	// sk->sk_receive_queue, skb_copy_datagram_msg() copies it into the
	// user buffer, and the skb is then freed
}

recvfrom cap

That completes the path where user space creates a socket and captures traffic with the recvfrom system call.

Note that this flow involves multiple memory copies: one from the link layer up to the socket's receive queue, and another from the receive queue into user space.

To do

  • promiscuous mode and the "any" device in pcap_activate_linux()
  • 2020 N1CTF W2L kernel pwn

ChangeLog

socket(PF_PACKET, SOCK_RAW/SOCK_DGRAM, htons(ETH_P_ALL))/socket(PF_INET, SOCK_PACKET, htons(ETH_P_ALL))

0.9: live_open_new/live_open_old; pcap_read_packet & recvfrom
1.0: activate_new -> activate_mmap / activate_old; pcap_read_linux_mmap / pcap_read_packet & recvfrom
1.9: activate_pf_packet/activate_sock_packet; pcap_read_packet / pcap_read_linux_mmap_v1[_64]/v2/v3; recvfrom
1.10: activate_pf_packet; pcap_read_linux_mmap_v2/v3

1.9 added DLT_LINUX_SLL2 support; 1.10 added DLT_LINUX_SLL2 support for cooked-mode captures.

1.10
Require PF_PACKET support, and kernel 2.6.27 or later
Handle systems without AF_INET or AF_UNIX socket support

As you can see, the releases differ quite a bit.

Reference

[how tcpdump cap] https://mp.weixin.qq.com/s/ZX8Jluh-RgJXcVh3OvycRQ
[tcpdump-libpcap] https://blog.csdn.net/bie_niu1992/article/details/96435670
[BPF] https://zh.wikipedia.org/zh-hans/BPF
[tcpdump man] https://www.tcpdump.org/manpages/tcpdump.1.html
[ring buffer] https://www.compuquip.com/blog/tuning-ring-buffer-checkpoint-firewall
[libpcap mmap] https://www.cnblogs.com/10087622blog/p/8320234.html
[libpcap better than raw socket] https://stackoverflow.com/questions/60736230/why-libpcap-is-better-than-sniffing-with-raw
[cap speed compare] https://blog.csdn.net/gengzhikui1992/article/details/103142848
http://blog.chinaunix.net/uid-23069658-id-3141409.html
http://blog.chinaunix.net/uid-20357359-id-1963684.html
https://zhuanlan.zhihu.com/p/345901595
