Linux Network Architecture: Network Layer


Linux Network Architecture

Network Layer

Isaac Y. Tsai <eplusplus@gmail.com>


Outline

Network layer in Linux
Netfilter and iptables framework
PF_RING architecture


Interface between device driver and network layer


Network layer functions

<kernel src>/net/ipv4/ip_input.c
    ip_rcv(skb)
    ip_rcv_finish(skb)
    ip_local_deliver(skb)
    ip_local_deliver_finish(skb)

<kernel src>/net/ipv4/ip_forward.c
    ip_forward(skb)
    ip_forward_finish(skb)

<kernel src>/net/ipv4/ipmr.c
    int ip_mr_input(skb)

<kernel src>/net/ipv4/ip_output.c
    ip_queue_xmit(skb, ipfragok)
    ip_local_out(skb)
    __ip_local_out(skb)
    ip_output(skb)
    ip_finish_output(skb)
    ip_finish_output2(skb)
    ip_mc_output(skb)
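Before diving into the individual functions, it helps to see how they chain together. The sketch below is assembled from the call sites shown on the following slides (NF_HOOK okfn targets and the skb->dst->input()/output() assignments):

Receive path:

    netif_receive_skb()                     /net/core/dev.c
      -> ip_rcv()                           NF_INET_PRE_ROUTING hook
          -> ip_rcv_finish()                route lookup, sets skb->dst
              -> dst_input() = skb->dst->input():
                   ip_local_deliver()       local unicast/multicast
                     -> NF_INET_LOCAL_IN -> ip_local_deliver_finish()
                   ip_forward()             forwarded unicast
                     -> NF_INET_FORWARD -> ip_forward_finish() -> dst_output()
                   ip_mr_input()            forwarded multicast

Transmit path:

    ip_queue_xmit() -> ip_local_out() -> __ip_local_out()
      -> NF_INET_LOCAL_OUT -> dst_output() = skb->dst->output():
           ip_output() -> NF_INET_POST_ROUTING -> ip_finish_output()
             -> ip_finish_output2() (after ip_fragment(), if needed)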


netif_receive_skb()

<kernel src>/net/core/dev.c

int netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    struct net_device *orig_dev, *master, *null_or_orig, *null_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;

    if (!skb->tstamp.tv64)
        net_timestamp(skb);
    if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
        return NET_RX_SUCCESS;
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;

    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);
    if (master) {
        if (skb_bond_should_drop(skb, master))
            null_or_orig = orig_dev;
        else
            skb->dev = master;
    }

    __get_cpu_var(netdev_rx_stat).total++;

    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb->mac_len = skb->network_header - skb->mac_header;

    pt_prev = NULL;
    rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACT
    if (skb->tc_verd & TC_NCLS) {
        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
        goto ncls;
    }
#endif

    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
            ptype->dev == orig_dev) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

#ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
ncls:
#endif

    skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
    skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;

    null_or_bond = NULL;
    if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
        (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
        null_or_bond = vlan_dev_real_dev(skb->dev);
    }

    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type && (ptype->dev == null_or_orig ||
             ptype->dev == skb->dev || ptype->dev == orig_dev ||
             ptype->dev == null_or_bond)) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}


net_rx_action()

<kernel src>/net/core/dev.c

static void net_rx_action(struct softirq_action *h)
{
    struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    void *have;

    local_irq_disable();

    while (!list_empty(list)) {
        struct napi_struct *n;
        int work, weight;

        if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable();

        n = list_first_entry(list, struct napi_struct, poll_list);

        have = netpoll_poll_lock(n);
        weight = n->weight;
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            work = n->poll(n, weight);
            trace_napi_poll(n);
        }
        WARN_ON_ONCE(work > weight);

        budget -= work;

        local_irq_disable();

        if (unlikely(work == weight)) {
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();
            } else
                list_move_tail(&n->poll_list, list);
        }

        netpoll_poll_unlock(have);
    }
out:
    local_irq_enable();

#ifdef CONFIG_NET_DMA
    /*
     * There may not be any more sk_buffs coming right now, so push
     * any pending DMA copies to hardware
     */
    dma_issue_pending_all();
#endif

    return;

softnet_break:
    __get_cpu_var(netdev_rx_stat).time_squeeze++;
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    goto out;
}


Packet reception path: ip_rcv()

Network layer packet reception starts in ip_rcv().

ip_rcv() first performs some error checking related to the packet type and the packet header, and it updates packet statistics. At the end, it makes a macro call to NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);


ip_rcv()

<kernel src>/net/ipv4/ip_input.c

int ip_rcv(struct sk_buff *skb, struct net_device *dev,
           struct packet_type *pt, struct net_device *orig_dev)
{
    struct iphdr *iph;
    u32 len;

    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;

    IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto out;
    }

    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = ip_hdr(skb);

    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    if (!pskb_may_pull(skb, iph->ihl * 4))
        goto inhdr_error;

    iph = ip_hdr(skb);

    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;

    len = ntohs(iph->tot_len);
    if (skb->len < len) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
        goto drop;
    } else if (len < (iph->ihl * 4))
        goto inhdr_error;

    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

    /* Must drop socket now because of tproxy. */
    skb_orphan(skb);

    return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
                   ip_rcv_finish);

inhdr_error:
    IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}


ip_rcv_finish()

ip_rcv_finish() calls ip_route_input(). The skb->dst pointer of the socket buffer is set to an entry in the routing cache, which stores not only the destination at the IP level but also, if present, a pointer to an entry in the hard-header cache (a cache of layer-2 frame headers). If ip_route_input() cannot find a route, the packet is discarded.

At the end of ip_rcv_finish(), the IP protocol's processing reaches the junction between packets addressed to the local computer and packets to be forwarded. The information about the further path of an IP packet is stored in the routing entry skb->dst, and a trick often used in the Linux kernel appears here: where a switch on a variable value would otherwise select between handler functions, the kernel instead stores a pointer to the appropriate function directly. This saves an if or switch instruction for each decision about how processing should continue.
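A simplified illustration of this dispatch trick (generic C, not kernel code):

/* The route lookup stores the handler once, so delivery needs no
 * per-packet if/switch. */
struct sk_buff;                                /* opaque here */

struct dst_like {
    int (*input)(struct sk_buff *skb);         /* ip_local_deliver(), ip_forward(), ... */
};

static inline int dst_input_like(struct sk_buff *skb, struct dst_like *dst)
{
    /* The routing decision was made once, at lookup time. */
    return dst->input(skb);
}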


The pointer skb->dst->input() points to the function that should be used to handle a packet further:

ip_local_deliver() is entered in the case of unicast and multicast packets that should be delivered to the local computer.

ip_forward() handles all unicast packets that should be forwarded.

ip_mr_input() is used for multicast packets that should be forwarded.


ip_rcv_finish(skb)

<kernel src>/net/ipv4/ip_input.c

static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    if (skb_dst(skb) == NULL) {
        int err = ip_route_input(skb, iph->daddr, iph->saddr,
                                 iph->tos, skb->dev);
        if (unlikely(err)) {
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                                IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                                IPSTATS_MIB_INNOROUTES);
            goto drop;
        }
    }

#ifdef CONFIG_NET_CLS_ROUTE
    if (unlikely(skb_dst(skb)->tclassid)) {
        struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
        u32 idx = skb_dst(skb)->tclassid;
        st[idx & 0xFF].o_packets++;
        st[idx & 0xFF].o_bytes += skb->len;
        st[(idx >> 16) & 0xFF].i_packets++;
        st[(idx >> 16) & 0xFF].i_bytes += skb->len;
    }
#endif

    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
                           skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
                           skb->len);

    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}


ip_local_deliver(skb)

<kernel src>/net/ipv4/ip_input.c

/* Deliver IP Packets to the higher protocol layers. */
int ip_local_deliver(struct sk_buff *skb)
{
    /* Reassemble IP fragments. */
    if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
        if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }

    return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}


ip_local_deliver_finish(skb)

<kernel src>/net/ipv4/ip_input.c

static int ip_local_deliver_finish(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);

    __skb_pull(skb, ip_hdrlen(skb));

    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);

    rcu_read_lock();
    {
        int protocol = ip_hdr(skb)->protocol;
        int hash, raw;
        const struct net_protocol *ipprot;

    resubmit:
        raw = raw_local_deliver(skb, protocol);

        hash = protocol & (MAX_INET_PROTOS - 1);
        ipprot = rcu_dereference(inet_protos[hash]);
        if (ipprot != NULL) {
            int ret;

            if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
                if (net_ratelimit())
                    printk("%s: proto %d isn't netns-ready\n",
                           __func__, protocol);
                kfree_skb(skb);
                goto out;
            }

            if (!ipprot->no_policy) {
                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    kfree_skb(skb);
                    goto out;
                }
                nf_reset(skb);
            }
            ret = ipprot->handler(skb);
            if (ret < 0) {
                protocol = -ret;
                goto resubmit;
            }
            IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
        } else {
            if (!raw) {
                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                    icmp_send(skb, ICMP_DEST_UNREACH,
                              ICMP_PROT_UNREACH, 0);
                }
            } else
                IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
            kfree_skb(skb);
        }
    }
out:
    rcu_read_unlock();
    return 0;
}


dst_input(skb)

<net/dst.h>

static inline int dst_input(struct sk_buff *skb)
{
    return skb_dst(skb)->input(skb);
}

<linux/skbuff.h>

static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
    return (struct dst_entry *)skb->_skb_dst;
}

static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
    skb->_skb_dst = (unsigned long)dst;
}


dst_output(skb)

<net/dst.h>

/* Output packet to network from transport. */
static inline int dst_output(struct sk_buff *skb)
{
    return skb_dst(skb)->output(skb);
}


struct dst_entry

<net/dst.h>

struct dst_entry {
    struct rcu_head     rcu_head;
    struct dst_entry    *child;
    struct net_device   *dev;
    short               error, obsolete;
    int                 flags;
    unsigned long       expires;
    unsigned short      header_len, trailer_len; /* space to reserve at tail */
    unsigned int        rate_tokens;
    unsigned long       rate_last;      /* rate limiting for ICMP */
    struct dst_entry    *path;
    struct neighbour    *neighbour;
    struct hh_cache     *hh;
#ifdef CONFIG_XFRM
    struct xfrm_state   *xfrm;
#else
    void                *__pad1;
#endif
    int                 (*input)(struct sk_buff *);
    int                 (*output)(struct sk_buff *);
    struct dst_ops      *ops;
    u32                 metrics[RTAX_MAX];
#ifdef CONFIG_NET_CLS_ROUTE
    __u32               tclassid;
#else
    __u32               __pad2;
#endif
    /* Align __refcnt to a 64 bytes alignment */
#ifdef CONFIG_64BIT
    long                __pad_to_align_refcnt[1];
#endif
    /*
     * __refcnt wants to be on a different cache line from
     * input/output/ops or performance tanks badly
     */
    atomic_t            __refcnt;       /* client references */
    int                 __use;
    unsigned long       lastuse;
    union {
        struct dst_entry  *next;
        struct rtable     *rt_next;
        struct rt6_info   *rt6_next;
        struct dn_route   *dn_next;
    };
};


ip_forward(skb)

The primary task of ip_forward(skb) is to process a few conditions of the Internet Protocol (e.g., a packet's lifetime) and packet options. First, packets not marked with pkt_type == PACKET_HOST are deleted. Next, the reach of the packet is checked: if the value in its TTL field is 1 (before it is decremented), the packet is deleted. RFC 791 specifies that, when this happens, an ICMP packet (ICMP_TIME_EXCEEDED) has to be returned to the sender.

Once a redirect message has been checked, if applicable, the socket buffer is checked for sufficient headroom: skb_cow(skb, headroom) verifies that there is still enough space for the MAC header of the output network device (out_dev->hard_header_len); if not, skb_realloc_headroom() creates sufficient space. Subsequently, the TTL field of the IP packet is decremented by one.

When the actual packet length (including the MAC header) is known, it is checked against the frame format of the new output network device. If the packet is too long (skb->len > mtu) and fragmentation is forbidden because the Don't-Fragment bit is set in the IP header, the packet is discarded and the ICMP message ICMP_FRAG_NEEDED is sent to the sender; for example, a 1500-byte packet with DF set cannot be forwarded over a route whose MTU is 1400. In any case, the packet is not fragmented here; fragmenting is delayed until output. This early test prevents Don't-Fragment candidates from running through the entire IP protocol-handling process only to be dropped at the end.


ip_forward(skb)

<kernel src>/net/ipv4/ip_forward.c

int ip_forward(struct sk_buff *skb)
{
    struct iphdr *iph;          /* Our header */
    struct rtable *rt;          /* Route we use */
    struct ip_options *opt = &(IPCB(skb)->opt);

    if (skb_warn_if_lro(skb))
        goto drop;

    if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
        goto drop;

    if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
        return NET_RX_SUCCESS;

    if (skb->pkt_type != PACKET_HOST)
        goto drop;

    skb_forward_csum(skb);

    /* According to the RFC, we must first decrease the TTL field. If
       that reaches zero, we must reply an ICMP control message telling
       that the packet's lifetime expired. */
    if (ip_hdr(skb)->ttl <= 1)
        goto too_many_hops;

    if (!xfrm4_route_forward(skb))
        goto drop;

    rt = skb_rtable(skb);

    if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
        goto sr_failed;

    if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
                 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
        IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS);
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                  htonl(dst_mtu(&rt->u.dst)));
        goto drop;
    }

    /* We are about to mangle packet. Copy it! */
    if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev) + rt->u.dst.header_len))
        goto drop;
    iph = ip_hdr(skb);

    /* Decrease ttl after skb cow done */
    ip_decrease_ttl(iph);

    /* now generate an ICMP HOST REDIRECT giving the route calculated. */
    if (rt->rt_flags & RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
        ip_rt_send_redirect(skb);

    skb->priority = rt_tos2priority(iph->tos);

    return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev,
                   rt->u.dst.dev, ip_forward_finish);

sr_failed:
    /* Strict routing permits no gatewaying */
    icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
    goto drop;

too_many_hops:
    /* Tell the sender its packet died... */
    IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
    icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}


ip_forward_finish(skb)

<kernel src>/net/ipv4/ip_forward.c

static int ip_forward_finish(struct sk_buff *skb)
{
    struct ip_options *opt = &(IPCB(skb)->opt);

    IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

    if (unlikely(opt->optlen))
        ip_forward_options(skb);

    return dst_output(skb);
}

ip_forward_finish() has very little functionality of its own. Once the IP options, if present, have been processed in ip_forward_options(), it hands the packet to dst_output(); whether the packet has to be fragmented is decided later, in ip_finish_output().


ip_forward_options(skb)

<kernel src>/net/ipv4/ip_forward.c

void ip_forward_options(struct sk_buff *skb)
{
    struct ip_options *opt = &(IPCB(skb)->opt);
    unsigned char *optptr;
    struct rtable *rt = skb_rtable(skb);
    unsigned char *raw = skb_network_header(skb);

    if (opt->rr_needaddr) {
        optptr = (unsigned char *)raw + opt->rr;
        ip_rt_get_source(&optptr[optptr[2]-5], rt);
        opt->is_changed = 1;
    }
    if (opt->srr_is_hit) {
        int srrptr, srrspace;

        optptr = raw + opt->srr;

        for (srrptr = optptr[2], srrspace = optptr[1];
             srrptr <= srrspace;
             srrptr += 4) {
            if (srrptr + 3 > srrspace)
                break;
            if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
                break;
        }
        if (srrptr + 3 <= srrspace) {
            opt->is_changed = 1;
            ip_rt_get_source(&optptr[srrptr-1], rt);
            ip_hdr(skb)->daddr = rt->rt_dst;
            optptr[2] = srrptr + 4;
        } else if (net_ratelimit())
            printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
        if (opt->ts_needaddr) {
            optptr = raw + opt->ts;
            ip_rt_get_source(&optptr[optptr[2]-9], rt);
            opt->is_changed = 1;
        }
    }
    if (opt->is_changed) {
        opt->is_changed = 0;
        ip_send_check(ip_hdr(skb));
    }
}


ip_send_check(iph)

<kernel src>/net/ipv4/ip_output.c

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
    iph->check = 0;
    iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}


ip_queue_xmit(skb, ipfragok)

<kernel src>/net/ipv4/ip_output.c

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
    struct sock *sk = skb->sk;
    struct inet_sock *inet = inet_sk(sk);
    struct ip_options *opt = inet->opt;
    struct rtable *rt;
    struct iphdr *iph;

    rt = skb_rtable(skb);
    if (rt != NULL)
        goto packet_routed;

    /* Make sure we can route this packet. */
    rt = (struct rtable *)__sk_dst_check(sk, 0);
    if (rt == NULL) {
        __be32 daddr;

        /* Use correct destination address if we have options. */
        daddr = inet->inet_daddr;
        if (opt && opt->srr)
            daddr = opt->faddr;

        {
            struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                .mark = sk->sk_mark,
                                .nl_u = { .ip4_u =
                                          { .daddr = daddr,
                                            .saddr = inet->inet_saddr,
                                            .tos = RT_CONN_FLAGS(sk) } },
                                .proto = sk->sk_protocol,
                                .flags = inet_sk_flowi_flags(sk),
                                .uli_u = { .ports =
                                           { .sport = inet->inet_sport,
                                             .dport = inet->inet_dport } } };

            security_sk_classify_flow(sk, &fl);
            if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
                goto no_route;
        }
        sk_setup_caps(sk, &rt->u.dst);
    }
    skb_dst_set(skb, dst_clone(&rt->u.dst));

packet_routed:
    if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
        goto no_route;

    skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
    skb_reset_network_header(skb);
    iph = ip_hdr(skb);
    *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
    if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
        iph->frag_off = htons(IP_DF);
    else
        iph->frag_off = 0;
    iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
    iph->protocol = sk->sk_protocol;
    iph->saddr    = rt->rt_src;
    iph->daddr    = rt->rt_dst;

    if (opt && opt->optlen) {
        iph->ihl += opt->optlen >> 2;
        ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
    }

    ip_select_ident_more(iph, &rt->u.dst, sk,
                         (skb_shinfo(skb)->gso_segs ?: 1) - 1);

    skb->priority = sk->sk_priority;
    skb->mark = sk->sk_mark;

    return ip_local_out(skb);

no_route:
    IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
    kfree_skb(skb);
    return -EHOSTUNREACH;
}


ip_local_out(skb)

<kernel src>/net/ipv4/ip_output.c

int ip_local_out(struct sk_buff *skb)
{
    int err;

    err = __ip_local_out(skb);
    if (likely(err == 1))
        err = dst_output(skb);

    return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);


__ip_local_out(skb)

<kernel src>/net/ipv4/ip_output.c

int __ip_local_out(struct sk_buff *skb)
{
    struct iphdr *iph = ip_hdr(skb);

    iph->tot_len = htons(skb->len);
    ip_send_check(iph);
    return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb,
                   NULL, skb_dst(skb)->dev, dst_output);
}


ip_output(skb)

<kernel src>/net/ipv4/ip_output.c

int ip_output(struct sk_buff *skb)
{
    struct net_device *dev = skb_dst(skb)->dev;

    IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

    skb->dev = dev;
    skb->protocol = htons(ETH_P_IP);

    return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
                        ip_finish_output,
                        !(IPCB(skb)->flags & IPSKB_REROUTED));
}


ip_finish_output(skb)

<kernel src>/net/ipv4/ip_output.c

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
    /* Policy lookup after SNAT yielded a new policy */
    if (skb_dst(skb)->xfrm != NULL) {
        IPCB(skb)->flags |= IPSKB_REROUTED;
        return dst_output(skb);
    }
#endif
    if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
        return ip_fragment(skb, ip_finish_output2);
    else
        return ip_finish_output2(skb);
}


ip_finish_output2(skb)

<kernel src>/net/ipv4/ip_output.c

static inline int ip_finish_output2(struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);
    struct rtable *rt = (struct rtable *)dst;
    struct net_device *dev = dst->dev;
    unsigned int hh_len = LL_RESERVED_SPACE(dev);

    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

    /* Be paranoid, rather than too clever. */
    if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
        struct sk_buff *skb2;

        skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
        if (skb2 == NULL) {
            kfree_skb(skb);
            return -ENOMEM;
        }
        if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);
        kfree_skb(skb);
        skb = skb2;
    }

    if (dst->hh)
        return neigh_hh_output(dst->hh, skb);
    else if (dst->neighbour)
        return dst->neighbour->output(skb);

    if (net_ratelimit())
        printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
    kfree_skb(skb);
    return -EINVAL;
}


Netfilter hooks for connection tracking


NF_HOOK()

<linux/netfilter.h>

static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
}

static inline int
NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,
               struct net_device *in, struct net_device *out,
               int (*okfn)(struct sk_buff *), int thresh)
{
    int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);
    if (ret == 1)
        ret = okfn(skb);
    return ret;
}


Arguments of the NF_HOOK macro

pf (protocol family): The identifier of the protocol family: PF_INET for IP version 4, PF_INET6 for IP version 6.

hook: The hook identifier. All valid identifiers for each protocol family are defined in a header file (e.g., <linux/netfilter_ipv4.h>).

skb: A pointer to the sk_buff structure holding the packet to be handled.

indev (input device): A pointer to the net_device structure of the network device that received the packet. For a locally generated packet, as in the __ip_local_out() call shown earlier, it is set to NULL.

outdev (output device): A pointer to the net_device structure of the network device the packet should use to leave the local computer. For that outgoing call the output device comes from the routing entry (skb_dst(skb)->dev); for an incoming packet, as in ip_rcv(), outdev is NULL because the route has not yet been determined.

okfn() (okay function): This function is invoked when all filter functions registered with this hook have returned NF_ACCEPT, thereby okaying the packet's transit.
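The two call sites already shown in this deck illustrate the convention:

/* Incoming packet (ip_rcv): input device known, output device not yet routed */
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

/* Locally generated packet (__ip_local_out): no input device,
 * output device taken from the routing entry */
return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
               dst_output);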


nf_hook()

<linux/netfilter.h>

static inline int nf_hook(u_int8_t pf, unsigned int hook,
                          struct sk_buff *skb, struct net_device *indev,
                          struct net_device *outdev,
                          int (*okfn)(struct sk_buff *))
{
    return nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN);
}

nf_hook_thresh(), which nf_hook() calls, is shown on the next slide together with its documentation.


nf_hook_thresh()

<linux/netfilter.h>

/**
 * nf_hook_thresh - call a netfilter hook
 *
 * Returns 1 if the hook has allowed the packet to pass. The function
 * okfn must be invoked by the caller in this case. Any other return
 * value indicates the packet has been consumed by the hook.
 */
static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
                                 struct sk_buff *skb,
                                 struct net_device *indev,
                                 struct net_device *outdev,
                                 int (*okfn)(struct sk_buff *), int thresh)
{
#ifndef CONFIG_NETFILTER_DEBUG
    if (list_empty(&nf_hooks[pf][hook]))
        return 1;
#endif
    return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);
}


nf_hook_slow()

<kernel src>/net/netfilter/core.c

/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
                 struct net_device *indev, struct net_device *outdev,
                 int (*okfn)(struct sk_buff *), int hook_thresh)
{
    struct list_head *elem;
    unsigned int verdict;
    int ret = 0;

    /* We may already have this, but read-locks nest anyway */
    rcu_read_lock();

    elem = &nf_hooks[pf][hook];
next_hook:
    verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
                         outdev, &elem, okfn, hook_thresh);
    if (verdict == NF_ACCEPT || verdict == NF_STOP) {
        ret = 1;
    } else if (verdict == NF_DROP) {
        kfree_skb(skb);
        ret = -EPERM;
    } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
        if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
                      verdict >> NF_VERDICT_BITS))
            goto next_hook;
    }
    rcu_read_unlock();
    return ret;
}
EXPORT_SYMBOL(nf_hook_slow);




nf_iterate()

<kernel src>/net/netfilter/core.c

unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb,
                        unsigned int hook,
                        const struct net_device *indev,
                        const struct net_device *outdev,
                        struct list_head **i,
                        int (*okfn)(struct sk_buff *),
                        int hook_thresh)
{
    unsigned int verdict;

    /* The caller must not block between calls to this function
     * because of risk of continuing from deleted element. */
    list_for_each_continue_rcu(*i, head) {
        struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

        if (hook_thresh > elem->priority)
            continue;

        /* Optimization: we don't need to hold module reference here,
         * since function can't sleep. --RR */
        verdict = elem->hook(hook, skb, indev, outdev, okfn);
        if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
            if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) {
                NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook);
                continue;
            }
#endif
            if (verdict != NF_REPEAT)
                return verdict;
            *i = (*i)->prev;
        }
    }
    return NF_ACCEPT;
}


Netfilter hook identifiers

<linux/netfilter_ipv4.h>

NF_IP_PRE_ROUTING (0): Incoming packets pass this hook in the ip_rcv() function before they are processed by the routing code. Prior to that, only a few simple consistency checks on the version, length, and checksum fields of the IP header are done. Meaningful uses of this hook arise whenever incoming packets should be caught before they are processed, for example to detect certain denial-of-service attacks that rely on malformed IP packets, for address-translation mechanisms (NAT), or for accounting functions (counting incoming packets).

NF_IP_LOCAL_IN (1): All incoming packets addressed to the local computer pass this hook in the function ip_local_deliver(). At this point, the iptables module hooks the INPUT rules list into place to filter incoming data packets. This corresponds to the input rules list in ipchains.

NF_IP_FORWARD (2): All incoming packets not addressed to the local computer pass this hook in the function ip_forward(), that is, packets to be forwarded and leaving the computer over a different network interface. This includes any packet whose address was modified by NAT. At this point, the iptables module hooks the FORWARD rules list into place to filter forwarded data packets. This corresponds to the forward rules list in ipchains.

NF_IP_LOCAL_OUT (3): All outgoing packets created on the local computer pass this hook in the function ip_build_and_send_pkt(). At this point, the iptables module hooks the OUTPUT rules list into place to filter outgoing data packets. This corresponds to the output rules list in ipchains.

NF_IP_POST_ROUTING (4): This hook in the ip_finish_output() function represents the last chance to access all outgoing (forwarded or locally created) packets before they leave the computer over a network device. Like NF_IP_PRE_ROUTING, it is a good place to integrate accounting functions.

The mapping from hooks to the functions that traverse them is summarized below.
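Summary of where each IPv4 hook is traversed, collected from the descriptions above and the NF_HOOK call sites shown earlier in this deck:

    Hook                      Traversed in
    NF_IP_PRE_ROUTING  (0)    ip_rcv()
    NF_IP_LOCAL_IN     (1)    ip_local_deliver()
    NF_IP_FORWARD      (2)    ip_forward()
    NF_IP_LOCAL_OUT    (3)    ip_build_and_send_pkt() / __ip_local_out()
    NF_IP_POST_ROUTING (4)    ip_output() (okfn: ip_finish_output())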


nf_hookfn

The packet-filter functions that are actually hooked into the netfilter hooks are so-called hook functions of the type nf_hookfn. The parameters (except for the protocol-family identifier) correspond exactly to those of the NF_HOOK macro.

<linux/netfilter.h>

typedef unsigned int nf_hookfn(unsigned int hooknum,
                               struct sk_buff *skb,
                               const struct net_device *in,
                               const struct net_device *out,
                               int (*okfn)(struct sk_buff *));


Return value of a packet-filter function

The return value of a packet-filter function specifies what should happen to the packet. These are defined in <linux/netfilter.h>.

NF_DROP (0): The active rules list processing is stopped, and the packet is dropped.

NF_ACCEPT (1): The packet is passed to the next packet filter function in the rules list. Once the end of the list has been reached, the packet is released by okfn() for further processing.

NF_STOLEN (2): The packet filter function withholds the packet for further processing, so that the active rules list processing is stopped. In contrast to NF_DROP, however, the packet does not have to be explicitly dropped.

NF_QUEUE (3): The function nf_queue() (net/core/netfilter.c) puts the packet in a queue from which it can be removed and processed (e.g., by a user space program). Subsequently, nf_reinject() has to be invoked to return the packet to the Linux kernel for further processing by netfilter.

NF_REPEAT (4): In contrast to NF_ACCEPT, rather than a continuation of processing at the next packet-filter function, the current filter function is invoked again.
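As a quick illustration of these verdicts, a hypothetical hook function (using the nf_hookfn signature shown earlier; not taken from the kernel sources) could return a different verdict per transport protocol:

static unsigned int verdict_demo(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    struct iphdr *iph = ip_hdr(skb);

    if (iph->protocol == IPPROTO_TCP)
        return NF_ACCEPT;   /* continue with the next rule in the list */
    if (iph->protocol == IPPROTO_UDP)
        return NF_QUEUE;    /* hand the packet to user space via nf_queue() */
    return NF_DROP;         /* discard everything else */
}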


nf_register_hook(), nf_unregister_hook()

nf_register_hook() and nf_unregister_hook() register and unregister a packet-filter function with the Linux kernel. The parameter passed is an nf_hook_ops structure, which includes all required information.

<linux/netfilter.h>

struct nf_hook_ops {
    struct list_head list;
    nf_hookfn *hook;
    struct module *owner;
    u_int8_t pf;
    unsigned int hooknum;
    /* Hooks are ordered in ascending priority. */
    int priority;
};


struct nf_hook_ops

list: The nf_hook_ops structures are maintained in a linked list within the Linux kernel.

hook(): A pointer to the actual packet-filter function of the type nf_hookfn.

pf, hooknum: The protocol-family identifier (e.g., PF_INET or PF_INET6) and the hook identifier (e.g., NF_IP_LOCAL_IN) are used to determine the hook for this packet-filter function.

priority: Packet-filter functions within the rules list of a hook are sorted by the priority field in ascending order, so that they are invoked in this order when a packet transits. Priority values are defined, for example, in <linux/netfilter_ipv4.h>:

enum nf_ip_hook_priorities {
    NF_IP_PRI_FIRST = INT_MIN,
    NF_IP_PRI_CONNTRACK = -200,
    NF_IP_PRI_MANGLE = -150,
    NF_IP_PRI_NAT_DST = -100,
    NF_IP_PRI_FILTER = 0,
    NF_IP_PRI_NAT_SRC = 100,
    NF_IP_PRI_LAST = INT_MAX,
};
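To make the ordering concrete, here is a hypothetical sketch (early_fn and late_fn are placeholder hooks, not kernel code): since the list is kept sorted by ascending priority, the NF_IP_PRI_FIRST function always runs before the NF_IP_PRI_LAST function, regardless of registration order.

static unsigned int early_fn(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    return NF_ACCEPT;   /* runs first: priority INT_MIN */
}

static unsigned int late_fn(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    return NF_ACCEPT;   /* runs last: priority INT_MAX */
}

static struct nf_hook_ops early_ops = {
    .hook     = early_fn,
    .pf       = PF_INET,
    .hooknum  = NF_IP_PRE_ROUTING,
    .priority = NF_IP_PRI_FIRST,
};

static struct nf_hook_ops late_ops = {
    .hook     = late_fn,
    .pf       = PF_INET,
    .hooknum  = NF_IP_PRE_ROUTING,
    .priority = NF_IP_PRI_LAST,
};

/* nf_register_hook(&early_ops); nf_register_hook(&late_ops); */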


First netfilter example module

/* Sample code to install a Netfilter hook function that will
 * drop all incoming packets. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>

static struct nf_hook_ops nfho;

/* Hook function; matches the nf_hookfn typedef shown above */
static unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    return NF_DROP;     /* Drop ALL packets */
}

static int __init my_init(void)
{
    /* Fill in our hook structure */
    nfho.hook = my_hookfunc;            /* Handler function */
    nfho.hooknum = NF_IP_PRE_ROUTING;   /* First hook for IPv4 */
    nfho.pf = PF_INET;
    nfho.priority = NF_IP_PRI_FIRST;    /* Make our function first */
    nf_register_hook(&nfho);
    return 0;
}

static void __exit my_exit(void)
{
    nf_unregister_hook(&nfho);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");


Second netfilter example module

// For any packet, get the IP header and check the protocol field.
// If the protocol number equals UDP (17), log to /var/log/messages.
// The default action of the module is to let all packets through.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/ip.h>

static struct nf_hook_ops nfho;     // netfilter hook option struct
static struct udphdr *udp_header;
static struct iphdr *ip_header;     // IP header struct

static unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    if (!skb)
        return NF_ACCEPT;           // check the buffer before using it
    ip_header = (struct iphdr *)skb_network_header(skb);
    if (ip_header->protocol == 17) {
        udp_header = (struct udphdr *)skb_transport_header(skb);
        printk(KERN_INFO "got udp packet\n"); // log to /var/log/messages
        return NF_DROP;
    }
    return NF_ACCEPT;
}

static int __init my_init(void)
{
    nfho.hook = my_hookfunc;
    nfho.hooknum = NF_IP_PRE_ROUTING;
    nfho.pf = PF_INET;
    nfho.priority = NF_IP_PRI_FIRST;
    nf_register_hook(&nfho);
    return 0;
}

static void __exit my_exit(void)
{
    nf_unregister_hook(&nfho);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");


Third netfilter example module

/* Sample code to install a Netfilter hook function that will drop
 * all incoming packets from an IP address we specify. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/ip.h>

/* The structure used to register the filter function */
static struct nf_hook_ops nfho;

/* IP address we want to drop packets from, in network byte order */
static unsigned char drop_ip[] = "\x7f\x00\x00\x01";   /* 127.0.0.1 */

/* This is the hook function itself */
static unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff *skb,
        const struct net_device *in, const struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    struct iphdr *iph = ip_hdr(skb);

    if (iph->saddr == *(__be32 *)drop_ip) {
        printk("Dropped packet from %d.%d.%d.%d\n",
               drop_ip[0], drop_ip[1], drop_ip[2], drop_ip[3]);
        return NF_DROP;
    }
    return NF_ACCEPT;
}

static int __init my_init(void)
{
    nfho.hook = my_hookfunc;
    nfho.hooknum = NF_IP_PRE_ROUTING;   /* First hook for IPv4 */
    nfho.pf = PF_INET;
    nfho.priority = NF_IP_PRI_FIRST;    /* Make our function first */
    nf_register_hook(&nfho);
    return 0;
}

/* Cleanup routine */
static void __exit my_exit(void)
{
    nf_unregister_hook(&nfho);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");


Netfilter connection tracking

The module interface of the connection-tracking module is located in the file net/ipv4/netfilter/ip_conntrack_standalone.c. The file net/ipv4/netfilter/ip_conntrack_core.c contains the actual connection-tracking functionality. The connection-tracking module hooks itself into the netfilter hooks NF_IP_PRE_ROUTING and NF_IP_LOCAL_OUT with very high priority (NF_IP_PRI_CONNTRACK is set to -200 in <linux/netfilter_ipv4.h>).

<linux/netfilter_ipv4.h>

enum nf_ip_hook_priorities {
    NF_IP_PRI_FIRST = INT_MIN,
    NF_IP_PRI_CONNTRACK_DEFRAG = -400,
    NF_IP_PRI_RAW = -300,
    NF_IP_PRI_SELINUX_FIRST = -225,
    NF_IP_PRI_CONNTRACK = -200,
    NF_IP_PRI_MANGLE = -150,
    NF_IP_PRI_NAT_DST = -100,
    NF_IP_PRI_FILTER = 0,
    NF_IP_PRI_SECURITY = 50,
    NF_IP_PRI_NAT_SRC = 100,
    NF_IP_PRI_SELINUX_LAST = 225,
    NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
    NF_IP_PRI_LAST = INT_MAX,
};
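Schematically, that registration looks like the sketch below, modeled on the nf_hook_ops usage shown above. This is illustrative only, not the literal ip_conntrack_standalone.c code; conntrack_hookfn stands in for the entry points defined in ip_conntrack_core.c.

static unsigned int conntrack_hookfn(unsigned int hooknum,
        struct sk_buff *skb, const struct net_device *in,
        const struct net_device *out, int (*okfn)(struct sk_buff *))
{
    /* placeholder: the real code classifies the packet's connection */
    return NF_ACCEPT;
}

static struct nf_hook_ops conntrack_ops[] = {
    {
        .hook     = conntrack_hookfn,
        .pf       = PF_INET,
        .hooknum  = NF_IP_PRE_ROUTING,
        .priority = NF_IP_PRI_CONNTRACK,  /* -200: before mangle/NAT/filter */
    },
    {
        .hook     = conntrack_hookfn,
        .pf       = PF_INET,
        .hooknum  = NF_IP_LOCAL_OUT,
        .priority = NF_IP_PRI_CONNTRACK,
    },
};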


ip_route_input()

<kernel src>/net/ipv4/route.c

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                   u8 tos, struct net_device *dev)
{
    struct rtable *rth;
    unsigned hash;
    int iif = dev->ifindex;
    struct net *net;

    net = dev_net(dev);

    if (!rt_caching(net))
        goto skip_cache;

    tos &= IPTOS_RT_MASK;
    hash = rt_hash(daddr, saddr, iif, rt_genid(net));

    rcu_read_lock();
    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
         rth = rcu_dereference(rth->u.dst.rt_next)) {
        if (((rth->fl.fl4_dst ^ daddr) |
             (rth->fl.fl4_src ^ saddr) |
             (rth->fl.iif ^ iif) |
             rth->fl.oif |
             (rth->fl.fl4_tos ^ tos)) == 0 &&
            rth->fl.mark == skb->mark &&
            net_eq(dev_net(rth->u.dst.dev), net) &&
            !rt_is_expired(rth)) {
            dst_use(&rth->u.dst, jiffies);
            RT_CACHE_STAT_INC(in_hit);
            rcu_read_unlock();
            skb_dst_set(skb, &rth->u.dst);
            return 0;
        }
        RT_CACHE_STAT_INC(in_hlist_search);
    }
    rcu_read_unlock();

skip_cache:
    /* Multicast recognition logic is moved from route cache to here.
       The problem was that too many Ethernet cards have broken/missing
       hardware multicast filters :-( As result the host on multicasting
       network acquires a lot of useless route cache entries, sort of
       SDR messages from all the world. Now we try to get rid of them.
       Really, provided software IP multicast filter is organized
       reasonably (at least, hashed), it does not result in a slowdown
       comparing with route cache reject entries.
       Note, that multicast routers are not affected, because
       route cache entry is created eventually. */
    if (ipv4_is_multicast(daddr)) {
        struct in_device *in_dev;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
            int our = ip_check_mc(in_dev, daddr, saddr,
                                  ip_hdr(skb)->protocol);
            if (our
#ifdef CONFIG_IP_MROUTE
                ||
                (!ipv4_is_local_multicast(daddr) &&
                 IN_DEV_MFORWARD(in_dev))
#endif
                ) {
                rcu_read_unlock();
                return ip_route_input_mc(skb, daddr, saddr,
                                         tos, dev, our);
            }
        }
        rcu_read_unlock();
        return -EINVAL;
    }
    return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}


ip_route_input_slow()

<kernel src>/net/ipv4/route.c

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
    struct fib_result res;
    struct in_device *in_dev = in_dev_get(dev);
    struct flowi fl = { .nl_u = { .ip4_u =
                                  { .daddr = daddr,
                                    .saddr = saddr,
                                    .tos = tos,
                                    .scope = RT_SCOPE_UNIVERSE,
                                  } },
                        .mark = skb->mark,
                        .iif = dev->ifindex };
    unsigned        flags = 0;
    u32             itag = 0;
    struct rtable  *rth;
    unsigned        hash;
    __be32          spec_dst;
    int             err = -EINVAL, free_res = 0;
    struct net     *net = dev_net(dev);

    /* IP on this device is disabled. */
    if (!in_dev)
        goto out;

    /* Check for the most weird martians, which can be not detected
       by fib_lookup. */
    if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
        ipv4_is_loopback(saddr))
        goto martian_source;

    if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
        goto brd_input;

    /* Accept zero addresses only to limited broadcast;
       I even do not know to fix it or not. Waiting for complains :-) */
    if (ipv4_is_zeronet(saddr))
        goto martian_source;

    if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
        ipv4_is_loopback(daddr))
        goto martian_destination;

    /* Now we are ready to route packet. */
    if ((err = fib_lookup(net, &fl, &res)) != 0) {
        if (!IN_DEV_FORWARD(in_dev))
            goto e_hostunreach;
        goto no_route;
    }
    free_res = 1;

    RT_CACHE_STAT_INC(in_slow_tot);

    if (res.type == RTN_BROADCAST)
        goto brd_input;

    if (res.type == RTN_LOCAL) {
        int result;

        result = fib_validate_source(saddr, daddr, tos,
                                     net->loopback_dev->ifindex,
                                     dev, &spec_dst, &itag, skb->mark);
        if (result < 0)
            goto martian_source;
        if (result)
            flags |= RTCF_DIRECTSRC;
        spec_dst = daddr;
        goto local_input;
    }

    if (!IN_DEV_FORWARD(in_dev))
        goto e_hostunreach;
    if (res.type != RTN_UNICAST)
        goto martian_destination;

    err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
    in_dev_put(in_dev);
    if (free_res)
        fib_res_put(&res);
out:
    return err;

brd_input:
    if (skb->protocol != htons(ETH_P_IP))
        goto e_inval;

    if (ipv4_is_zeronet(saddr))
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
    else {
        err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
                                  &itag, skb->mark);
        if (err < 0)
            goto martian_source;
        if (err)
            flags |= RTCF_DIRECTSRC;
    }
    flags |= RTCF_BROADCAST;
    res.type = RTN_BROADCAST;
    RT_CACHE_STAT_INC(in_brd);

local_input:
    rth = dst_alloc(&ipv4_dst_ops);
    if (!rth)
        goto e_nobufs;

    rth->u.dst.output = ip_rt_bug;
    rth->u.dst.obsolete = -1;
    rth->rt_genid = rt_genid(net);

    atomic_set(&rth->u.dst.__refcnt, 1);
    rth->u.dst.flags = DST_HOST;
    if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
        rth->u.dst.flags |= DST_NOPOLICY;
    rth->fl.fl4_dst = daddr;
    rth->rt_dst     = daddr;
    rth->fl.fl4_tos = tos;
    rth->fl.mark    = skb->mark;
    rth->fl.fl4_src = saddr;
    rth->rt_src     = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
    rth->u.dst.tclassid = itag;
#endif
    rth->rt_iif =
    rth->fl.iif     = dev->ifindex;
    rth->u.dst.dev  = net->loopback_dev;
    dev_hold(rth->u.dst.dev);
    rth->idev       = in_dev_get(rth->u.dst.dev);
    rth->rt_gateway = daddr;
    rth->rt_spec_dst = spec_dst;
    rth->u.dst.input = ip_local_deliver;
    rth->rt_flags   = flags | RTCF_LOCAL;
    if (res.type == RTN_UNREACHABLE) {
        rth->u.dst.input = ip_error;
        rth->u.dst.error = -err;
        rth->rt_flags  &= ~RTCF_LOCAL;
    }
    rth->rt_type = res.type;
    hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
    err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
    goto done;

no_route:
    RT_CACHE_STAT_INC(in_no_route);
    spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
    res.type = RTN_UNREACHABLE;
    if (err == -ESRCH)
        err = -ENETUNREACH;
    goto local_input;

    /* Do not cache martian addresses: they should be logged (RFC1812) */
martian_destination:
    RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
    if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
        printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
               &daddr, &saddr, dev->name);
#endif

e_hostunreach:
    err = -EHOSTUNREACH;
    goto done;

e_inval:
    err = -EINVAL;
    goto done;

e_nobufs:
    err = -ENOBUFS;
    goto done;

martian_source:
    ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
    goto e_inval;
}


ip_handle_martian_source()

<kernel src>/net/ipv4/route.c

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr, __be32 saddr)
{
    RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
    if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
        /* RFC1812: if source is martian, the only hint is MAC header */
        printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
               &daddr, &saddr, dev->name);
        if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
            int i;
            const unsigned char *p = skb_mac_header(skb);

            printk(KERN_WARNING "ll header: ");
            for (i = 0; i < dev->hard_header_len; i++, p++) {
                printk("%02x", *p);
                if (i < (dev->hard_header_len - 1))
                    printk(":");
            }
            printk("\n");
        }
    }
#endif
}


PF_RING architecture


PF_RING

PF_RING is a new type of socket-based interface. It consists of three software modules:

- A kernel module, pf_ring (pf_ring.h and pf_ring.c), written as a new socket protocol type; it handles all the socket buffers in both packet reception and transmission.
- A user-space library, libpfring (pfring.h and pfring.c, shipped as libpfring.a or libpfring.o), which helps user applications access the underlying socket-based ring-buffer management scheme.
- A set of example user application programs, such as pfcount.c, that demonstrate how to use PF_RING.
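To show how an application consumes the ring, here is a minimal capture loop in the spirit of pfcount.c. The libpfring calls and signatures below (pfring_open(), pfring_recv(), pfring_close()) are assumptions based on the 4.x-era API and vary between PF_RING releases; treat this as a sketch, not the definitive interface.

/* Minimal libpfring capture loop, modeled on pfcount.c.
 * NOTE: signatures assumed from the 4.x-era API; they differ
 * across PF_RING versions. */
#include <stdio.h>
#include "pfring.h"

int main(void)
{
    struct pfring_pkthdr hdr;
    char buffer[2048];
    pfring *ring;

    /* device, promiscuous mode, snaplen, reentrant */
    ring = pfring_open("eth0", 1, sizeof(buffer), 0);
    if (ring == NULL) {
        fprintf(stderr, "pfring_open failed\n");
        return 1;
    }

    for (;;) {
        /* last argument: block until a packet arrives */
        if (pfring_recv(ring, buffer, sizeof(buffer), &hdr, 1) > 0)
            printf("captured %u bytes\n", hdr.caplen);
    }

    pfring_close(ring);
    return 0;
}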


Some pf_ring.c global variables

static struct proto ring_proto;
static struct list_head ring_table;
static u_int ring_table_size;
static struct list_head ring_cluster_list;
/* List of all devices on which PF_RING has been registered */
static struct list_head ring_aware_device_list;
/* List of all dna (direct nic access) devices */
static struct list_head ring_dna_devices_list;
static u_int dna_devices_list_size = 0;
/* pf_ring.h: #define MAX_NUM_DEVICES 256 */
static struct list_head device_ring_list[MAX_NUM_DEVICES];

static struct net_proto_family ring_family_ops = {
    .family = PF_RING,
    .create = ring_create,
    .owner = THIS_MODULE,
};

/* Dummy 'any' device */
static struct net_device any_dev, none_dev;


struct proto

<net/sock.h>

/* Networking protocol blocks attached to sockets (the socket layer ->
 * transport layer interface; the transport -> network interface is
 * defined by struct inet_proto). */
struct proto {
    void        (*close)(struct sock *sk, long timeout);
    int         (*connect)(struct sock *sk, struct sockaddr *uaddr,
                           int addr_len);
    int         (*disconnect)(struct sock *sk, int flags);
    struct sock *(*accept)(struct sock *sk, int flags, int *err);
    int         (*ioctl)(struct sock *sk, int cmd, unsigned long arg);
    int         (*init)(struct sock *sk);
    void        (*destroy)(struct sock *sk);
    void        (*shutdown)(struct sock *sk, int how);
    int         (*setsockopt)(struct sock *sk, int level, int optname,
                              char __user *optval, unsigned int optlen);
    int         (*getsockopt)(struct sock *sk, int level, int optname,
                              char __user *optval, int __user *option);
#ifdef CONFIG_COMPAT
    int         (*compat_setsockopt)(struct sock *sk, int level,
                                     int optname, char __user *optval,
                                     unsigned int optlen);
    int         (*compat_getsockopt)(struct sock *sk, int level,
                                     int optname, char __user *optval,
                                     int __user *option);
#endif
    int         (*sendmsg)(struct kiocb *iocb, struct sock *sk,
                           struct msghdr *msg, size_t len);
    int         (*recvmsg)(struct kiocb *iocb, struct sock *sk,
                           struct msghdr *msg, size_t len, int noblock,
                           int flags, int *addr_len);
    int         (*sendpage)(struct sock *sk, struct page *page,
                            int offset, size_t size, int flags);
    int         (*bind)(struct sock *sk, struct sockaddr *uaddr,
                        int addr_len);
    int         (*backlog_rcv)(struct sock *sk, struct sk_buff *skb);

    /* Keeping track of sk's, looking them up, and port selection methods. */
    void        (*hash)(struct sock *sk);
    void        (*unhash)(struct sock *sk);
    int         (*get_port)(struct sock *sk, unsigned short snum);

#ifdef CONFIG_PROC_FS
    unsigned int inuse_idx;
#endif

    void        (*enter_memory_pressure)(struct sock *sk);
    atomic_t    *memory_allocated;          /* Current allocated memory. */
    struct percpu_counter *sockets_allocated; /* Current num of sockets. */
    int         *memory_pressure;
    int         *sysctl_mem;
    int         *sysctl_wmem;
    int         *sysctl_rmem;
    int         max_header;

    struct kmem_cache *slab;
    unsigned int obj_size;
    int         slab_flags;

    struct percpu_counter *orphan_count;

    struct request_sock_ops  *rsk_prot;
    struct timewait_sock_ops *twsk_prot;

    union {
        struct inet_hashinfo *hashinfo;
        struct udp_table     *udp_table;
        struct raw_hashinfo  *raw_hash;
    } h;

    struct module   *owner;
    char            name[32];
    struct list_head node;
#ifdef SOCK_REFCNT_DEBUG
    atomic_t        socks;
#endif
};


struct ring_opt

/* Ring options */
struct ring_opt {
    u_int8_t ring_active, num_rx_channels;
    struct net_device *ring_netdev;
    u_short ring_pid;
    u_int32_t ring_id;
    char *appl_name;            /* String that ids the application bound to the socket */
    packet_direction direction; /* Capture direction for packets */
    struct ring_opt *master_ring;   /* Master ring */
    u_int8_t mmap_count;
    dna_device *dna_device;     /* Direct NIC Access */
    u_short cluster_id;         /* Cluster, 0 = no cluster */
    int32_t channel_id;         /* Channel, -1 = any channel */
    struct net_device *reflector_dev;   /* Reflector device */

    /* Packet buffers */
    unsigned long order;

    /* Ring Slots */
    void *ring_memory;
    u_int32_t bucket_len;
    FlowSlotInfo *slots_info;   /* Points to ring_memory */
    char *ring_slots;           /* Points to ring_memory + sizeof(FlowSlotInfo) */

    /* Packet Sampling */
    u_int32_t pktToSample, sample_rate;

    /* BPF Filter */
    struct sk_filter *bpfFilter;

    /* Filtering Rules */
    filtering_hash_bucket **filtering_hash;
    u_int16_t num_filtering_rules;
    u_int8_t rules_default_accept_policy;   /* 1 = default is accept, drop otherwise */
    struct list_head rules;

    atomic_t num_ring_users;

    /* Locks */
    wait_queue_head_t ring_slots_waitqueue;
    rwlock_t ring_index_lock, ring_rules_lock;

    /* Indexes (Internal) */
    u_int insert_page_id, insert_slot_id;

    /* Function pointer */
    do_handle_filtering_hash_bucket handle_hash_rule;
};


struct pfring_hooks

/* Hack to jump from a device directly to PF_RING */
struct pfring_hooks {
    u_int32_t magic;    /* Should be set to PF_RING and be the first field */
    unsigned int *transparent_mode;
    handle_ring_skb ring_handler;
    handle_ring_buffer buffer_ring_handler;
    handle_add_hdr_to_ring buffer_add_hdr_to_ring;
    register_pfring_plugin pfring_registration;
    unregister_pfring_plugin pfring_unregistration;
    handle_ring_dna_device ring_dna_device_handler;
    read_device_pfring_free_slots pfring_free_device_slots;
};


Global variable ring_hooks

/* pf_ring.h */
#define PF_RING 27      /* Packet Ring */
#define SOCK_RING PF_RING

/* pf_ring.c */
static struct pfring_hooks ring_hooks = {
    .magic = PF_RING,
    .transparent_mode = &transparent_mode,
    .ring_handler = skb_ring_handler,
    .buffer_ring_handler = buffer_ring_handler,
    .buffer_add_hdr_to_ring = add_hdr_to_ring,
    .pfring_registration = register_plugin,
    .pfring_unregistration = unregister_plugin,
    .ring_dna_device_handler = dna_device_handler,
};


ring_init()

kernel/pf_ring.c

static int __init ring_init(void)
{
    int i, rc;

    if ((rc = proto_register(&ring_proto, 0)) != 0)
        return rc;

    INIT_LIST_HEAD(&ring_table);
    INIT_LIST_HEAD(&ring_cluster_list);
    INIT_LIST_HEAD(&ring_aware_device_list);
    INIT_LIST_HEAD(&ring_dna_devices_list);

    for (i = 0; i < MAX_NUM_DEVICES; i++)
        INIT_LIST_HEAD(&device_ring_list[i]);

    memset(&any_dev, 0, sizeof(any_dev));
    strcpy(any_dev.name, "any");
    memset(&none_dev, 0, sizeof(none_dev));
    strcpy(none_dev.name, "none");

    ring_proc_init();
    sock_register(&ring_family_ops);
    register_netdevice_notifier(&ring_netdev_notifier);

    /* Sanity check */
    if (transparent_mode > driver2pf_ring_non_transparent)
        transparent_mode = standard_linux_path;

    printk("[PF_RING] Ring slots       %d\n", num_slots);
    printk("[PF_RING] Slot version     %d\n", RING_FLOWSLOT_VERSION);
    printk("[PF_RING] Capture TX       %s\n",
           enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
    printk("[PF_RING] Transparent Mode %d\n", transparent_mode);
    printk("[PF_RING] IP Defragment    %s\n", enable_ip_defrag ? "Yes" : "No");
    printk("[PF_RING] Initialized correctly\n");

    register_device_handler();
    pfring_enabled = 1;
    return 0;
}


ring_proc_init()

static void ring_proc_init(void)
{
    ring_proc_dir = proc_mkdir("pf_ring",
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
                               init_net.
#endif
                               proc_net);

    if (ring_proc_dir) {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30))
        ring_proc_dir->owner = THIS_MODULE;
#endif
        ring_proc_dev_dir = proc_mkdir(PROC_DEV, ring_proc_dir);
        ring_proc = create_proc_read_entry(PROC_INFO, 0,
                                           ring_proc_dir,
                                           ring_proc_get_info, NULL);
        ring_proc_plugins_info =
            create_proc_read_entry(PROC_PLUGINS_INFO, 0, ring_proc_dir,
                                   ring_proc_get_plugin_info, NULL);

        if (!ring_proc || !ring_proc_plugins_info)
            printk("[PF_RING] unable to register proc file\n");
        else {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30))
            ring_proc->owner = THIS_MODULE;
            ring_proc_plugins_info->owner = THIS_MODULE;
#endif
            printk("[PF_RING] registered /proc/net/pf_ring/\n");
        }
    } else
        printk("[PF_RING] unable to create /proc/net/pf_ring\n");
}


INIT_LIST_HEAD()

<linux/list.h>

struct list_head {
    struct list_head *next, *prev;
};

static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}


register_netdevice_notifier(nb)

<linux/netdevice.h>
<kernel src>/net/core/dev.c

int register_netdevice_notifier(struct notifier_block *nb)
{
    struct net_device *dev;
    struct net_device *last;
    struct net *net;
    int err;

    rtnl_lock();
    err = raw_notifier_chain_register(&netdev_chain, nb);
    if (err)
        goto unlock;
    if (dev_boot_phase)
        goto unlock;
    for_each_net(net) {
        for_each_netdev(net, dev) {
            err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
            err = notifier_to_errno(err);
            if (err)
                goto rollback;

            if (!(dev->flags & IFF_UP))
                continue;

            nb->notifier_call(nb, NETDEV_UP, dev);
        }
    }

unlock:
    rtnl_unlock();
    return err;

rollback:
    last = dev;
    for_each_net(net) {
        for_each_netdev(net, dev) {
            if (dev == last)
                break;

            if (dev->flags & IFF_UP) {
                nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
                nb->notifier_call(nb, NETDEV_DOWN, dev);
            }
            nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
            nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
        }
    }
    raw_notifier_chain_unregister(&netdev_chain, nb);
    goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);


dev_add_pack(pt)

<linux/netdevice.h>
<kernel src>/net/core/dev.c

void dev_add_pack(struct packet_type *pt)
{
    int hash;

    spin_lock_bh(&ptype_lock);
    if (pt->type == htons(ETH_P_ALL))
        list_add_rcu(&pt->list, &ptype_all);
    else {
        hash = ntohs(pt->type) & PTYPE_HASH_MASK;
        list_add_rcu(&pt->list, &ptype_base[hash]);
    }
    spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);


ring_notifier()

static struct notifier_block ring_netdev_notifier = {
    .notifier_call = ring_notifier,
};

static int ring_notifier(struct notifier_block *this, unsigned long msg,
                         void *data)
{
    struct net_device *dev = data;
    struct pfring_hooks *hook;

    switch (msg) {
    case NETDEV_UP:
        break;
    case NETDEV_DOWN:
        break;
    case NETDEV_REGISTER:
#ifdef RING_DEBUG
        printk("[PF_RING] packet_notifier(%s) [REGISTER][pfring_ptr=%p]\n",
               dev->name, dev->pfring_ptr);
#endif
        if (dev->pfring_ptr == NULL) {
            dev->pfring_ptr = &ring_hooks;
            add_device_to_ring_list(dev);
        }
        break;
    case NETDEV_UNREGISTER:
#ifdef RING_DEBUG
        printk("[PF_RING] packet_notifier(%s) [UNREGISTER][pfring_ptr=%p]\n",
               dev->name, dev->pfring_ptr);
#endif
        hook = (struct pfring_hooks *)dev->pfring_ptr;
        if (hook->magic == PF_RING) {
            remove_device_from_ring_list(dev);
            dev->pfring_ptr = NULL;
        }
        break;
    case NETDEV_CHANGE:     /* Interface state change */
    case NETDEV_CHANGEADDR:
        break;
    case NETDEV_CHANGENAME: /* Rename interface ethX -> ethY */
        {
            struct list_head *ptr, *tmp_ptr;
#if defined(RING_DEBUG)
            printk("[PF_RING] device change name %s\n", dev->name);
#endif
            list_for_each_safe(ptr, tmp_ptr, &ring_aware_device_list) {
                ring_device_element *dev_ptr =
                    list_entry(ptr, ring_device_element, list);

                if (dev_ptr->dev == dev) {
#if defined(RING_DEBUG)
                    printk("[PF_RING] ==>> FOUND device change name %s\n",
                           dev->name);
#endif
                    dev_ptr->proc_entry->name = dev->name;
                    break;
                }
            }
        }
        break;
    default:
        printk("[PF_RING] packet_notifier(%s): unhandled message [msg=%lu][pfring_ptr=%p]\n",
               dev->name, msg, dev->pfring_ptr);
        break;
    }

    return NOTIFY_DONE;
}


proto_register()

<net/sock.h>
<kernel src>/net/core/sock.c

int proto_register(struct proto *prot, int alloc_slab)
{
  if (alloc_slab) {
    prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
                                   SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL);
    if (prot->slab == NULL) {
      printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", prot->name);
      goto out;
    }
    if (prot->rsk_prot != NULL) {
      prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
      if (prot->rsk_prot->slab_name == NULL)
        goto out_free_sock_slab;
      prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
                                               prot->rsk_prot->obj_size, 0,
                                               SLAB_HWCACHE_ALIGN, NULL);
      if (prot->rsk_prot->slab == NULL) {
        printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
               prot->name);
        goto out_free_request_sock_slab_name;
      }
    }
    if (prot->twsk_prot != NULL) {
      prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
      if (prot->twsk_prot->twsk_slab_name == NULL)
        goto out_free_request_sock_slab;
      prot->twsk_prot->twsk_slab =
        kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                          prot->twsk_prot->twsk_obj_size, 0,
                          SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL);
      if (prot->twsk_prot->twsk_slab == NULL)
        goto out_free_timewait_sock_slab_name;
    }
  }
  write_lock(&proto_list_lock);
  list_add(&prot->node, &proto_list);
  assign_proto_idx(prot);
  write_unlock(&proto_list_lock);
  return 0;

out_free_timewait_sock_slab_name:
  kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
  if (prot->rsk_prot && prot->rsk_prot->slab) {
    kmem_cache_destroy(prot->rsk_prot->slab);
    prot->rsk_prot->slab = NULL;
  }
out_free_request_sock_slab_name:
  if (prot->rsk_prot) kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
  kmem_cache_destroy(prot->slab);
  prot->slab = NULL;
out:
  return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);
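PF_RING only needs the plain struct sock, so it can register its protocol without a dedicated slab cache. A hedged sketch of how the module would call this API (the ring_proto initializer shown here is illustrative):

static struct proto ring_proto = {
  .name     = "PF_RING",
  .owner    = THIS_MODULE,
  .obj_size = sizeof(struct sock),  /* no protocol-private state appended */
};

/* alloc_slab = 0: sockets come from the generic allocator,
   so none of the slab branches above are taken */
int rc = proto_register(&ring_proto, 0);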


sock_register()

<linux/net.h>
<kernel src>/net/socket.c

/* sock_register - add a socket protocol handler */
int sock_register(const struct net_proto_family *ops)
{
  int err;

  if (ops->family >= NPROTO) {
    printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
    return -ENOBUFS;
  }
  spin_lock(&net_family_lock);
  if (net_families[ops->family])
    err = -EEXIST;
  else {
    net_families[ops->family] = ops;
    err = 0;
  }
  spin_unlock(&net_family_lock);
  printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
  return err;
}
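This is the hook through which socket(PF_RING, SOCK_RAW, ...) reaches the module: PF_RING registers a net_proto_family whose create callback is the ring_create() shown later. A minimal sketch:

static struct net_proto_family ring_family_ops = {
  .family = PF_RING,
  .create = ring_create,   /* invoked by sys_socket() for family PF_RING */
  .owner  = THIS_MODULE,
};

/* module init: from now on userland can open PF_RING sockets */
sock_register(&ring_family_ops);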


register_device_handler(void)

/* Protocol hook */
static struct packet_type prot_hook;

void register_device_handler(void) {
  if(transparent_mode != standard_linux_path) return;
  prot_hook.func = packet_rcv;
  prot_hook.type = htons(ETH_P_ALL);
  dev_add_pack(&prot_hook);
}


sk_alloc()

<net/sock.h>
<kernel src>/net/core/sock.c

/**
 * sk_alloc - All socket objects are allocated here
 * @net: the applicable net namespace
 * @family: protocol family
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
{
  struct sock *sk;

  sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
  if (sk) {
    sk->sk_family = family;
    sk->sk_prot = sk->sk_prot_creator = prot;
    sock_lock_init(sk);
    sock_net_set(sk, get_net(net));
    atomic_set(&sk->sk_wmem_alloc, 1);
  }
  return sk;
}
EXPORT_SYMBOL(sk_alloc);


ring_create()

static int ring_create(struct net *net, struct socket *sock, int protocol, int kern)
{
  struct sock *sk;
  struct ring_opt *pfr;
  int err;

#if defined(RING_DEBUG)
  printk("[PF_RING] ring_create()\n");
#endif
  if(!capable(CAP_NET_ADMIN)) return -EPERM;
  if(sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT;
  if(protocol != htons(ETH_P_ALL)) return -EPROTONOSUPPORT;
  err = -ENOMEM;
#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
  sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
#else
#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24))
  sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
#else
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, &ring_proto);
#endif
#endif
  if(sk == NULL) goto out;
  sock->ops = &ring_ops;
  sock_init_data(sock, sk);
#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
  sk_set_owner(sk, THIS_MODULE);
#endif
  err = -ENOMEM;
  ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
  if(!(pfr = ring_sk(sk))) {
    sk_free(sk);
    goto out;
  }
  memset(pfr, 0, sizeof(*pfr));
  pfr->ring_active = 0;  /* Activate as soon as somebody waits for packets */
  pfr->num_rx_channels = UNKNOWN_NUM_RX_CHANNELS;
  pfr->channel_id = RING_ANY_CHANNEL;
  pfr->bucket_len = DEFAULT_BUCKET_LEN;
  pfr->handle_hash_rule = handle_filtering_hash_bucket;
  init_waitqueue_head(&pfr->ring_slots_waitqueue);
  rwlock_init(&pfr->ring_index_lock);
  rwlock_init(&pfr->ring_rules_lock);
  atomic_set(&pfr->num_ring_users, 0);
  INIT_LIST_HEAD(&pfr->rules);
  sk->sk_family = PF_RING;
  sk->sk_destruct = ring_sock_destruct;
  ring_insert(sk);
  pfr->master_ring = NULL;
  pfr->ring_netdev = &none_dev;  /* Unbound socket */
  pfr->sample_rate = 1;          /* No sampling */
  pfr->ring_pid = current->pid;
  pfr->ring_id = ring_id_serial++;
  ring_proc_add(pfr);
#if defined(RING_DEBUG)
  printk("[PF_RING] ring_create(): created\n");
#endif
  return(0);

out:
  return err;
}


packet_rcv(skb, dev, pt, orig_dev)

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
  int rc;

  if(skb->pkt_type != PACKET_LOOPBACK) {
    rc = skb_ring_handler(skb,
                          (skb->pkt_type == PACKET_OUTGOING) ? 0 : 1,
                          1, UNKNOWN_RX_CHANNEL, UNKNOWN_NUM_RX_CHANNELS);
  } else
    rc = 0;

  kfree_skb(skb);
  return(rc);
}


skb_ring_handler()

static int skb_ring_handler(struct sk_buff *skb,
                            u_char recv_packet,
                            u_char real_skb /* 1=real skb, 0=faked skb */,
                            u_int8_t channel_id,
                            u_int8_t num_rx_channels)
{
  struct sock *skElement;
  int rc = 0, is_ip_pkt, displ;
  struct list_head *ptr;
  struct pfring_pkthdr hdr;
  struct sk_buff *skk = NULL, *orig_skb = skb;

  if((!skb) || ((!enable_tx_capture) && (!recv_packet))) {
    /* An outgoing packet is about to be sent out
       but we decided not to handle transmitted packets. */
    return(0);
  }
  if(recv_packet) {
    /* Hack for identifying a packet received by the e1000 */
    if(real_skb)
      displ = SKB_DISPLACEMENT;
    else
      displ = 0;  /* Received by the e1000 wrapper */
  } else
    displ = 0;

  is_ip_pkt = parse_pkt(skb, displ, &hdr);

  if(enable_ip_defrag && real_skb && is_ip_pkt && recv_packet && (ring_table_size > 0)) {
    struct sk_buff *cloned = NULL;
    struct iphdr *iphdr = NULL;

    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb_set_network_header(skb, ETH_HLEN - displ);
    iphdr = ip_hdr(skb);
    if(iphdr) {
      if(iphdr->frag_off & htons(IP_MF | IP_OFFSET)) {
        if((cloned = skb_clone(skb, GFP_ATOMIC)) != NULL) {
          skk = ring_gather_frags(cloned);
          if(skk != NULL) {
            skb = skk;
            parse_pkt(skb, displ, &hdr);
            hdr.len = hdr.caplen = skb->len + displ;
          } else {
            return(0);  /* mask rcvd fragments */
          }
        }
      }
    }
  }

  if(skb->tstamp.tv64 == 0) __net_timestamp(skb);
  hdr.ts = ktime_to_timeval(skb->tstamp);
  hdr.len = hdr.caplen = skb->len + displ;

  /* Avoid the ring being manipulated while playing with it */
  read_lock_bh(&ring_mgmt_lock);

  /* [1] Check unclustered sockets */
  list_for_each(ptr, &ring_table) {
    struct ring_opt *pfr;
    struct ring_element *entry;

    entry = list_entry(ptr, struct ring_element, list);
    skElement = entry->sk;
    pfr = ring_sk(skElement);
    if((pfr != NULL)
       && (pfr->ring_netdev != &none_dev)
       && (pfr->cluster_id == 0)
       && (pfr->ring_slots != NULL)
       && is_valid_skb_direction(pfr->direction, recv_packet)
       && ((pfr->ring_netdev == skb->dev)
           || (pfr->ring_netdev == &any_dev)  /* Socket bound to 'any' */
           || ((skb->dev->flags & IFF_SLAVE)
               && (pfr->ring_netdev == skb->dev->master)))) {
      /* We've found the ring where the packet can be stored */
      int old_caplen = hdr.caplen;  /* Keep old length */

      hdr.caplen = min(hdr.caplen, pfr->bucket_len);
      add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ,
                      channel_id, num_rx_channels);
      hdr.caplen = old_caplen;
      rc = 1;  /* Ring found: we've done our job */
    }
  }

  /* [2] Check socket clusters */
  list_for_each(ptr, &ring_cluster_list) {
    ring_cluster_element *cluster_ptr;
    struct ring_opt *pfr;

    cluster_ptr = list_entry(ptr, ring_cluster_element, list);
    if(cluster_ptr->cluster.num_cluster_elements > 0) {
      u_int skb_hash = hash_pkt_cluster(cluster_ptr, &hdr);
      u_short num_iterations;

      for(num_iterations = 0;
          num_iterations < cluster_ptr->cluster.num_cluster_elements;
          num_iterations++) {
        skElement = cluster_ptr->cluster.sk[skb_hash];
        if(skElement != NULL) {
          pfr = ring_sk(skElement);
          if((pfr != NULL)
             && (pfr->ring_slots != NULL)
             && ((pfr->ring_netdev == skb->dev)
                 || ((skb->dev->flags & IFF_SLAVE)
                     && (pfr->ring_netdev == skb->dev->master)))
             && is_valid_skb_direction(pfr->direction, recv_packet)) {
            FlowSlot *theSlot = get_insert_slot(pfr);

            if((theSlot == NULL)
               || (theSlot->slot_state == 0 /* Not full */)) {
              /* We've found the ring where the packet can be stored */
              add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ,
                              channel_id, num_rx_channels);
              rc = 1;  /* Ring found: we've done our job */
              break;
            }
          }
        }
        if(cluster_ptr->cluster.hashing_mode != cluster_round_robin)
          break;
        else
          skb_hash = (skb_hash + 1) % cluster_ptr->cluster.num_cluster_elements;
      }
    }
  } /* Clustering */

  read_unlock_bh(&ring_mgmt_lock);

  /* Fragment handling */
  if(skk != NULL)
    kfree_skb(skk);

  if(rc == 1) {
    if(transparent_mode != driver2pf_ring_non_transparent) {
      rc = 0;
    } else {
      if(recv_packet && real_skb) {
        kfree_skb(orig_skb);
      }
    }
  }
  return(rc);  /* 0 = packet not handled */
}


User space library

All functions begin with the 'pfring_' prefix. A struct pfring in user space keeps all the information needed to use the underlying PF_RING module.

pfring* pfring_open(char *device_name, u_int8_t promisc,
                    u_int32_t caplen, u_int8_t reentrant);

int pfring_bind(pfring *ring, char *device_name);

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet);

int pfring_read(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet,
                u_int8_t consume_packet_immediately);
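Putting these calls together, a minimal user-space capture loop might look like the sketch below ("eth0", the 1500-byte caplen, and the buffer size are illustrative; error handling is trimmed):

#include <stdio.h>
#include "pfring.h"

int main(void) {
  u_char buffer[2048];
  struct pfring_pkthdr hdr;
  /* promisc = 1, caplen = 1500, reentrant = 0 (single-threaded) */
  pfring *ring = pfring_open("eth0", 1, 1500, 0);

  if(ring == NULL) return(1);

  while(1) {
    /* block until a packet arrives, consume it immediately */
    if(pfring_recv(ring, (char*)buffer, sizeof(buffer), &hdr, 1) > 0)
      printf("packet: len=%u caplen=%u\n", hdr.len, hdr.caplen);
  }

  pfring_close(ring);
  return(0);
}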


struct pfring

typedef struct {
  /* DNA (Direct NIC Access) */
  u_char dna_mapped_device;
  u_int32_t tot_dna_read_pkts, rx_reg;
  dna_device dna_dev;
  u_int32_t *rx_reg_ptr[MAX_NUM_RX_CHANNELS];

  /* All devices */
  char *buffer, *slots, *device_name;
  int fd;
  FlowSlotInfo *slots_info;
  FlowSlot *last_slot_to_update;
  u_int page_id, slot_id, pkts_per_page;
  u_int poll_sleep;
  u_int8_t clear_promisc, reentrant;
  u_long num_poll_calls;
  pthread_spinlock_t spinlock;
} pfring;


pfring_open()

pfring* pfring_open(char *device_name, u_int8_t promisc,
                    u_int32_t caplen, u_int8_t _reentrant) {
  int err = 0;
  pfring *ring = (pfring*)malloc(sizeof(pfring));

  if(ring == NULL)
    return(NULL);
  else
    memset(ring, 0, sizeof(pfring));

  ring->reentrant = _reentrant;
  ring->fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));

  if(ring->fd > 0) {
    int rc;
    u_int memSlotsLen;

    if(caplen > MAX_CAPLEN) caplen = MAX_CAPLEN;
    setsockopt(ring->fd, 0, SO_RING_BUCKET_LEN, &caplen, sizeof(caplen));

    if((device_name == NULL) || (strcmp(device_name, "none") == 0)) {
      rc = 0;  /* No binding yet */
    } else
      rc = pfring_bind(ring, device_name);

    if(rc == 0) {
      ring->buffer = (char *)mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE,
                                  MAP_SHARED, ring->fd, 0);
      if(ring->buffer == MAP_FAILED) {
        printf("mmap() failed");
        free(ring);
        return(NULL);
      }
      ring->slots_info = (FlowSlotInfo *)ring->buffer;
      if(ring->slots_info->version != RING_FLOWSLOT_VERSION) {
        printf("Wrong RING version: kernel is %i, libpfring was compiled with %i\n",
               ring->slots_info->version, RING_FLOWSLOT_VERSION);
        free(ring);
        return(NULL);
      }
      memSlotsLen = ring->slots_info->tot_mem;
      munmap(ring->buffer, PAGE_SIZE);
      ring->buffer = (char *)mmap(NULL, memSlotsLen, PROT_READ|PROT_WRITE,
                                  MAP_SHARED, ring->fd, 0);
      if(ring->buffer == MAP_FAILED) {
        printf("mmap() failed");
        free(ring);
        return(NULL);
      }
      ring->slots_info = (FlowSlotInfo *)ring->buffer;
      ring->slots = (char *)(ring->buffer + sizeof(FlowSlotInfo));
      if(ring->slots_info->remove_idx >= ring->slots_info->tot_slots)
        ring->slots_info->remove_idx = 0;
      ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;
      ring->device_name = strdup(device_name);

      if(promisc) {
        if(set_if_promisc(device_name, 1) == 0)
          ring->clear_promisc = 1;
      }
    } else {
      close(ring->fd);
      err = -1;
    }
  } else {
    err = -1;
    free(ring);
  }

  if(err == 0) {
    if(ring->reentrant)
      pthread_spin_init(&ring->spinlock, PTHREAD_PROCESS_PRIVATE);
    return(ring);
  } else
    return(NULL);
}


pfring_bind()

int pfring_bind(pfring *ring, char *device_name) {
  struct sockaddr sa;
  char *at;
  int32_t channel_id = -1;
  int rc = 0;

  if((device_name == NULL) || (strcmp(device_name, "none") == 0))
    return(-1);

  at = strchr(device_name, '@');
  if(at != NULL) {
    char *tok, *pos = NULL;

    at[0] = '\0';
    /* Syntax: ethX@1,5      channels 1 and 5
               ethX@1-5      channels 1,2...5
               ethX@1-3,5-7  channels 1,2,3,5,6,7 */
    tok = strtok_r(&at[1], ",", &pos);
    channel_id = 0;
    while(tok != NULL) {
      char *dash = strchr(tok, '-');
      int32_t min_val, max_val, i;

      if(dash) {
        dash[0] = '\0';
        min_val = atoi(tok);
        max_val = atoi(&dash[1]);
      } else
        min_val = max_val = atoi(tok);

      for(i = min_val; i <= max_val; i++)
        channel_id |= 1 << i;

      tok = strtok_r(NULL, ",", &pos);
    }
  }

  sa.sa_family = PF_RING;
  snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", device_name);
  rc = bind(ring->fd, (struct sockaddr *)&sa, sizeof(sa));

  if(rc == 0) {
    if(channel_id != -1) {
      int rc = pfring_set_channel_id(ring, channel_id);
      if(rc != 0)
        printf("pfring_set_channel_id() failed: %d\n", rc);
    }
  }
  return(rc);
}
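The '@' suffix parsed above turns a channel list into a bitmask that is then handed to pfring_set_channel_id(); for instance:

pfring_bind(ring, "eth0");        /* no '@': all channels (channel_id stays -1) */
pfring_bind(ring, "eth0@2");      /* channel 2 only      -> mask 0x04 */
pfring_bind(ring, "eth0@1-3,5");  /* channels 1,2,3,5    -> mask 0x2E */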


pfring_recv()

pfring_recv() is just a wrapper around the pfring_read() function:

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet)
{
  return(pfring_read(ring, buffer, buffer_len,
                     hdr, wait_for_incoming_packet, 1));
}


pfring_read()

int pfring_read(pfring *ring, char* buffer, u_int buffer_len,
                struct pfring_pkthdr *hdr,
                u_int8_t wait_for_incoming_packet,
                u_int8_t consume_packet_immediately) {
  if(ring == NULL) return(-1);

  if(ring->reentrant) {
    /* Late packet consumers are not supported in a multithreaded
       environment, as threads can steal each other's packets. */
    consume_packet_immediately = 1;
  }

  if(ring->dna_mapped_device) {
    char *pkt = NULL;

    if(wait_for_incoming_packet) {
      if(ring->reentrant) pthread_spin_lock(&ring->spinlock);
      switch(ring->dna_dev.device_model) {
      case intel_e1000:
        e1000_there_is_a_packet_to_read(ring, wait_for_incoming_packet);
        break;
      default:
        return(0);
      }
      if(ring->reentrant) pthread_spin_unlock(&ring->spinlock);
    }

    switch(ring->dna_dev.device_model) {
    case intel_e1000:
      pkt = get_next_e1000_packet(ring, buffer, buffer_len, hdr);
      break;
    case intel_igb:
      pkt = NULL, hdr->len = 0;
      break;
    case intel_ixgbe:
      pkt = NULL, hdr->len = 0;
      break;
    }

    if(pkt && (hdr->len > 0)) {
      /* Set the (1) below to (0) for enabling packet parsing for DNA devices */
      if(1)
        hdr->parsed_header_len = 0;
      else
        parse_pkt(buffer, hdr);
      return(1);
    } else
      return(0);
  } else {
    FlowSlot *slot;
    u_int32_t queuedPkts;
#ifdef USE_ADAPTIVE_WAIT
    u_int32_t num_loops = 0;
#endif

    if((ring == NULL) || (ring->buffer == NULL)) return(-1);
    if(ring->last_slot_to_update)
      pfring_notify(ring, REFLECT_PACKET_DEVICE_NONE);

  do_pfring_recv:
    if(ring->reentrant)
      pthread_spin_lock(&ring->spinlock);

    slot = (FlowSlot*)&ring->slots[ring->slots_info->remove_idx * ring->slots_info->slot_len];

    if(ring->slots_info->tot_insert >= ring->slots_info->tot_read)
      queuedPkts = ring->slots_info->tot_insert - ring->slots_info->tot_read;
    else
      queuedPkts = ring->slots_info->tot_slots + ring->slots_info->tot_insert
        - ring->slots_info->tot_read;

    if(queuedPkts && (slot->slot_state == 1 /* There's a packet to read */)) {
      char *bucket = (char*)&slot->bucket;
      struct pfring_pkthdr *_hdr = (struct pfring_pkthdr*)bucket;
      int bktLen = _hdr->caplen + _hdr->parsed_header_len;

      if(bktLen > buffer_len) bktLen = buffer_len - 1;

      if(buffer && (bktLen > 0)) {
        memcpy(buffer, &bucket[sizeof(struct pfring_pkthdr)], bktLen);
        bucket[bktLen] = '\0';
      }

      if(ring->slots_info->remove_idx >= (ring->slots_info->tot_slots - 1)) {
        ring->slots_info->remove_idx = 0;
        ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;
      } else {
        ring->slots_info->remove_idx++;
        ring->pkts_per_page++, ring->slot_id += ring->slots_info->slot_len;
      }

      if(hdr) memcpy(hdr, _hdr, sizeof(struct pfring_pkthdr));
      ring->slots_info->tot_read++;

      if(consume_packet_immediately) {
        ring->last_slot_to_update = NULL, slot->slot_state = 0;  /* Empty slot */
      } else {
        /* We do not notify pf_ring that the packet has been read,
           hence this slot will not be available for storing a new
           packet until we notify pf_ring. */
        ring->last_slot_to_update = slot;
      }

      if(ring->reentrant)
        pthread_spin_unlock(&ring->spinlock);
      return(1);
    } else {
      if(ring->reentrant)
        pthread_spin_unlock(&ring->spinlock);

      if(wait_for_incoming_packet) {
        struct pollfd pfd;
        int rc;

#ifdef USE_ADAPTIVE_WAIT
        /* Spin in userland for a while and, if no packet arrives,
           then it's time to poll the kernel. Only do poll() if there
           is no chance to avoid it, as a call to poll() is too costly. */
        if(num_loops < MAX_NUM_LOOPS) {
          num_loops++;
          if(num_loops % YIELD_MULTIPLIER) {
            sched_yield();
          }
        }
#endif
        /* Sleep when nothing is happening */
        pfd.fd = ring->fd;
        pfd.events = POLLIN|POLLERR;
        pfd.revents = 0;
        errno = 0;
        rc = poll(&pfd, 1, -1);
        ring->num_poll_calls++;

        if(rc == -1)
          return(-1);
        else
          goto do_pfring_recv;
      }
    }
    return(-1);  /* Not reached */
  }
}


pfcount.c

main() {
  /* Omitted: argument processing code */
  if(device == NULL) device = DEFAULT_DEVICE;
  if(num_threads > MAX_NUM_THREADS) num_threads = MAX_NUM_THREADS;
  printf("Capturing from %s\n", device);

  /* hardcoded: promisc=1, to_ms=500 */
  promisc = 1;

  if(num_threads > 0)
    pthread_rwlock_init(&statsLock, NULL);

  if(!dna_mode)
    pd = pfring_open(device, promisc, snaplen, (num_threads > 0) ? 1 : 0);
#ifdef ENABLE_DNA_SUPPORT
  else
    pd = pfring_open_dna(device, 0 /* we don't use threads */);
#endif
  /* Omitted: check pd for pfring_open() errors */
  /* Omitted: set filtering rule */

  signal(SIGINT, sigproc);
  signal(SIGTERM, sigproc);
  signal(SIGINT, sigproc);

  if(!verbose) {
    signal(SIGALRM, my_sigalarm);
    alarm(ALARM_SLEEP);
  }

  if(dna_mode)
    num_threads = 1;
  else {
    if(num_threads > 0) wait_for_packet = 1;
  }

  if(!wait_for_packet) pfring_enable_ring(pd);

  if(num_threads > 1) {
    pthread_t my_thread;
    int i;

    for(i = 1; i < num_threads; i++)
      pthread_create(&my_thread, NULL, packet_consumer_thread, (void*)i);
  }

  packet_consumer_thread(0);
  pfring_close(pd);
  sleep(3);
  return(0);
}


packet_consumer_thread()

void* packet_consumer_thread(void* _id) {
  while(1) {
    struct simple_stats {
      u_int64_t num_pkts, num_bytes;
    };
    u_char buffer[2048];
    struct simple_stats stats;
    struct pfring_pkthdr hdr;
    int rc;
    u_int len;

    if(do_shutdown) break;

    if(pfring_recv(pd, (char*)buffer, sizeof(buffer), &hdr, wait_for_packet) > 0) {
      if(do_shutdown) break;
      dummyProcesssPacket(&hdr, buffer);
    }

    if(0) {
      len = sizeof(stats);
      rc = pfring_get_filtering_rule_stats(pd, 5, (char*)&stats, &len);
      if(rc < 0)
        printf("pfring_get_filtering_rule_stats() failed [rc=%d]\n", rc);
      else {
        printf("[Pkts=%u][Bytes=%u]\n",
               (unsigned int)stats.num_pkts,
               (unsigned int)stats.num_bytes);
      }
    }
  }
  return(NULL);
}


Questions?