從Linux5.9看Icmp的處理流程
本文轉載自微信公眾號「編程雜技」,作者theanarkh。轉載本文請聯繫編程雜技公眾號。
昨天有個同學碰到發送udp包時收到destination unreachable的icmp包問題,本文簡單介紹一下linux5.9中icmp包的處理流程。
發送icmp包的流程
下面以udp為例看看什么時候會發送destination unreachable包。我們從收到一個udp包開始分析,具體函數是udp_rcv。
- int udp_rcv(struct sk_buff *skb){ // Receive entry point for UDP shown in this walkthrough: forwards the skb to __udp4_lib_rcv.
- return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); // passes the global udp_table and IPPROTO_UDP; the callee's result is returned unchanged
- }
- int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
- int proto){ // Delivers a received UDP skb to a matching socket, or answers with an ICMP port-unreachable when none is found (abbreviated excerpt).
- struct sock *sk;
- struct udphdr *uh;
- unsigned short ulen;
- struct rtable *rt = skb_rtable(skb);
- __be32 saddr, daddr;
- struct net *net = dev_net(skb->dev);
- bool refcounted;
- // UDP header
- uh = udp_hdr(skb);
- ulen = ntohs(uh->len);
- // Source and destination IP addresses
- saddr = ip_hdr(skb)->saddr;
- daddr = ip_hdr(skb)->daddr;
- // The length claimed by the UDP header is larger than the data actually held in the skb
- if (ulen > skb->len)
- goto short_packet; // NOTE(review): the short_packet label is not part of this excerpt
- if (proto == IPPROTO_UDP) {
- uh = udp_hdr(skb);
- }
- sk = skb_steal_sock(skb, &refcounted); // NOTE(review): sk is reassigned below before it is read; the handling of this result appears to be elided from the excerpt
- // Broadcast or multicast
- if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
- return __udp4_lib_mcast_deliver(net, skb, uh,
- saddr, daddr, udptable, proto);
- // Unicast: look up the matching socket from the address information
- sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
- // Socket found: hand the skb over to it
- if (sk)
- return udp_unicast_rcv_skb(sk, skb, uh);
- // No socket found: reply with an ICMP_DEST_UNREACH (port unreachable) ICMP packet
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- kfree_skb(skb);
- return 0;
- }
我們看到當通過ip包信息找不到對應socket的時候,就會發送一個icmp包給發送端。icmp包結構如下。
收到icmp包的處理流程
我們從收到ip包開始分析。
- int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
- struct net_device *orig_dev){ // IPv4 receive entry: runs ip_rcv_core, then invokes NF_HOOK for NF_INET_PRE_ROUTING with ip_rcv_finish as the final argument.
- struct net *net = dev_net(dev);
- skb = ip_rcv_core(skb, net);
- if (skb == NULL) // ip_rcv_core handed back no skb: report the packet as dropped
- return NET_RX_DROP;
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip_rcv_finish);
- }
ip層收到包后會繼續執行ip_rcv_finish。
- static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb){ // Runs ip_rcv_finish_core and, unless it reports NET_RX_DROP, passes the skb on to dst_input.
- struct net_device *dev = skb->dev;
- int ret;
- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
- if (ret != NET_RX_DROP)
- ret = dst_input(skb);
- return ret; // NET_RX_DROP from the core step, otherwise whatever dst_input returned
- }
接著執行dst_input
- static inline int dst_input(struct sk_buff *skb){ // Calls the input callback stored in the skb's dst entry and returns its result.
- return skb_dst(skb)->input(skb);
- }
input對應的是ip_local_deliver。
- int ip_local_deliver(struct sk_buff *skb){ // Local-delivery step: invokes NF_HOOK for NF_INET_LOCAL_IN with ip_local_deliver_finish as the final argument.
- struct net *net = dev_net(skb->dev);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
- net, NULL, skb, skb->dev, NULL,
- ip_local_deliver_finish);
- }
接著執行ip_local_deliver_finish。
- static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb){ // Hands the skb to ip_protocol_deliver_rcu using the protocol field of its IP header; always returns 0.
- __skb_pull(skb, skb_network_header_len(skb)); // presumably advances skb->data past the IP header — confirm against __skb_pull
- rcu_read_lock();
- ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
- rcu_read_unlock();
- return 0;
- }
ip_local_deliver_finish會執行ip_protocol_deliver_rcu進一步處理,ip_protocol_deliver_rcu的最后一個入參是ip包里的協議字段(上層協議)。
- void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol){ // Looks up the entry registered in inet_protos for `protocol` and calls its handler with the skb (abbreviated excerpt).
- const struct net_protocol *ipprot;
- int raw, ret;
- resubmit:
- // Find the handler registered for the protocol; in this walkthrough it is ICMP
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
- skb);
- if (ret < 0) { // negative result: repeat the lookup with protocol number -ret
- protocol = -ret;
- goto resubmit;
- }
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- }
- }
INDIRECT_CALL_2是一個宏。
- // INDIRECT_CALL_1: call f1 directly when f equals f1, otherwise call through f.
- #define INDIRECT_CALL_1(f, f1, ...) \
- ({ \
- likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__); \
- })
- // INDIRECT_CALL_2: call f2 directly when f equals f2, otherwise fall back to INDIRECT_CALL_1 with f1.
- #define INDIRECT_CALL_2(f, f2, f1, ...) \
- ({ \
- likely(f == f2) ? f2(__VA_ARGS__) : \
- INDIRECT_CALL_1(f, f1, __VA_ARGS__); \
- })
因為這里的protocol是icmp協議,所以會執行icmp對應的handler。那么對應的是哪個函數呢?我們看看inet_protos是什么。
- struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; // Table of registered protocol descriptors, indexed by protocol number.
- int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol){ // Registers prot in slot `protocol` of inet_protos; returns 0 or -1.
- return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
- NULL, prot) ? 0 : -1; // 0 only when cmpxchg returns NULL; presumably that means the slot was empty and now holds prot — confirm cmpxchg semantics
- }
我們看到inet_add_protocol函數是注冊協議和對應處理函數的。我們再來看看哪里會調用這個函數。
- static int __init inet_init(void) { // Init routine (excerpt): registers the ICMP and UDP protocol descriptors via inet_add_protocol.
- inet_add_protocol(&icmp_protocol, IPPROTO_ICMP);
- inet_add_protocol(&udp_protocol, IPPROTO_UDP);
- ... // remainder of the function is elided in this excerpt
- }
在內核初始化的時候會注冊一系列的協議和處理函數。下面我們看看icmp的函數集。
- static const struct net_protocol icmp_protocol = { // Protocol descriptor registered for IPPROTO_ICMP.
- .handler = icmp_rcv, // receive handler invoked for incoming ICMP packets
- .err_handler = icmp_err,
- .no_policy = 1,
- .netns_ok = 1,
- };
我們看到handler是icmp_rcv。
- int icmp_rcv(struct sk_buff *skb){ // Dispatches a received ICMP packet to the handler registered for its type in icmp_pointers (abbreviated excerpt).
- struct icmphdr *icmph;
- struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
- bool success;
- // ICMP header
- icmph = icmp_hdr(skb);
- success = icmp_pointers[icmph->type].handler(skb);
- } // NOTE(review): no return statement is shown for this int function; the excerpt appears abbreviated
icmp_rcv根據icmp包的信息做進一步處理。我們看看icmp_pointers的定義。
- static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { // Per-type ICMP handler table, indexed by ICMP type (excerpt; other entries elided).
- ...
- [ICMP_DEST_UNREACH] = {
- .handler = icmp_unreach, // destination-unreachable packets are handled by icmp_unreach
- .error = 1,
- },
- };
這里我們只關注ICMP_DEST_UNREACH的處理。
- static bool icmp_unreach(struct sk_buff *skb){ // Handler for ICMP_DEST_UNREACH (excerpt): ends by calling icmp_socket_deliver.
- ... // NOTE(review): the code that computes info, and the return statement, are elided in this excerpt
- icmp_socket_deliver(skb, info);
- }
繼續看icmp_socket_deliver
- static void icmp_socket_deliver(struct sk_buff *skb, u32 info){ // Passes an ICMP error to the err_handler of the protocol named in the IP header at skb->data.
- const struct iphdr *iph = (const struct iphdr *) skb->data;
- const struct net_protocol *ipprot;
- int protocol = iph->protocol;
- // Use the protocol field of the IP header to find the protocol's handlers. iph here is the original IP header that triggered the error, not the IP header of the received ICMP packet, so protocol is UDP
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && ipprot->err_handler) // nothing is called when no entry or no err_handler is registered
- ipprot->err_handler(skb, info);
- }
接著執行udp的err_handler,是udp_err
- int udp_err(struct sk_buff *skb, u32 info){
- return __udp4_lib_err(skb, info, &udp_table);}int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable){
- struct inet_sock *inet;
- const struct iphdr *iph = (const struct iphdr *)skb->data;
- struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- bool tunnel = false;
- struct sock *sk;
- int harderr;
- int err;
- struct net *net = dev_net(skb->dev);
- // 根據(jù)報文信息找到對應(yīng)socket
- sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex,
- inet_sdif(skb), udptable, NULL);
- err = 0;
- harderr = 0;
- inet = inet_sk(sk);
- switch (type) {
- case ICMP_DEST_UNREACH:
- err = EHOSTUNREACH;
- if (code <= NR_ICMP_UNREACH) {
- harderr = icmp_err_convert[code].fatal;
- err = icmp_err_convert[code].errno;
- }
- break;
- ...
- }
- // 設(shè)置錯誤信息到socket
- sk->sk_err = err;
- sk->sk_error_report(sk);
- out:
- return 0;
- }
__udp4_lib_err設置了錯誤信息,然后調用sk_error_report。sk_error_report是在調用socket函數時賦值的(具體在sock_init_data函數)。
- sk->sk_error_report = sock_def_error_report; // install sock_def_error_report as the socket's error-report callback
接著看sock_def_error_report
- static void sock_def_error_report(struct sock *sk){
- struct socket_wq *wq;
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
- wake_up_interruptible_poll(&wq->wait, EPOLLERR);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
- rcu_read_unlock();}static inline void sk_wake_async(const struct sock *sk, int how, int band){
- if (sock_flag(sk, SOCK_FASYNC)) {
- rcu_read_lock();
- sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
- rcu_read_unlock();
- }
- }
我們看到如果進程阻塞在socket則會被喚醒,或者設置了SOCK_FASYNC標記則收到信號。
后記:本文簡單介紹了icmp的產生和處理過程,后面有時間再細化一下。