首先我们看看alloc_netdev :include/linux/netdevice.h
/* Convenience form of alloc_netdev_mq() that always requests one queue (queue_count = 1). */
#define alloc_netdev(sizeof_priv, name, setup) \
alloc_netdev_mq(sizeof_priv, name, setup, 1)
点击(此处)折叠或打开
- /**
- * alloc_netdev_mq - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @setup: callback to initialize device
- * @queue_count: the number of subqueues to allocate
- *
- * Allocates a struct net_device with private data area for driver use
- * and performs basic initialization. Also allocates subqueue structs
- * for each queue on the device at the end of the netdevice.
- *
- * Returns the aligned device pointer on success. Returns NULL when either
- * allocation or dev_addr_init() fails; in that case everything allocated
- * here has already been freed.
- */
- struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *), unsigned int queue_count)
- {
- struct netdev_queue *tx;
- struct net_device *dev;
- size_t alloc_size;
- struct net_device *p;
-
- /* name is strcpy()'d into dev->name below, so it must fit including its NUL */
- BUG_ON(strlen(name) >= sizeof(dev->name));
-
- alloc_size = sizeof(struct net_device);
- if (sizeof_priv) {
- /* ensure 32-byte alignment of private area */
- alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
- alloc_size += sizeof_priv;
- }
- /* ensure 32-byte alignment of whole construct */
- alloc_size += NETDEV_ALIGN - 1;
-
- /* zero-filled allocation: every field not set below starts out as 0 */
- p = kzalloc(alloc_size, GFP_KERNEL);
- if (!p) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
- return NULL;
- }
-
- /* the TX queue array is a separate allocation, one entry per queue */
- tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx) {
- printk(KERN_ERR "alloc_netdev: Unable to allocate "
- "tx qdiscs.\n");
- goto free_p;
- }
-
- /* dev is p rounded up to NETDEV_ALIGN; padded records how far it moved */
- dev = PTR_ALIGN(p, NETDEV_ALIGN);
- dev->padded = (char *)dev - (char *)p;
-
- if (dev_addr_init(dev))
- goto free_tx;
-
- dev_unicast_init(dev);
-
- dev_net_set(dev, &init_net);
-
- dev->_tx = tx;
- dev->num_tx_queues = queue_count;
- dev->real_num_tx_queues = queue_count;
-
- dev->gso_max_size = GSO_MAX_SIZE;
-
- netdev_init_queues(dev);
-
- INIT_LIST_HEAD(&dev->napi_list);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
- /* the driver callback runs before the name is copied in */
- setup(dev);
- strcpy(dev->name, name);
- return dev;
-
- /* error unwinding: free_tx falls through into free_p */
- free_tx:
- kfree(tx);
-
- free_p:
- /* free the raw allocation p, not the aligned dev pointer */
- kfree(p);
- return NULL;
- }
这里我们关心一下
点击(此处)折叠或打开
- /*
- * Initialize the queues of @dev: first the single rx_queue, then every
- * TX queue through netdev_for_each_tx_queue(), both with NULL as the
- * extra argument. Finally initialize the device-wide tx_global_lock.
- */
- static void netdev_init_queues(struct net_device *dev)
- {
- netdev_init_one_queue(dev, &dev->rx_queue, NULL);
- netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
- spin_lock_init(&dev->tx_global_lock);
- }
当然还有很多变体api
这里我们也应该关注一下struct net_device这个结构体
点击(此处)折叠或打开
- /*
- * The DEVICE structure.
- * Actually, this whole structure is a big mistake. It mixes I/O
- * data with strictly "high-level" data, and it has to know about
- * almost every data structure used in the INET module.
- *
- * FIXME: cleanup struct net_device such that network protocol info
- * moves out.
- */
-
- struct net_device
- {
-
- /*
- * This is the first field of the "visible" part of this structure
- * (i.e. as seen by users in the "Space.c" file). It is the name
- * the interface.
- */
- char name[IFNAMSIZ];
- /* device name hash chain */
- struct hlist_node name_hlist;
- /* snmp alias */
- char *ifalias;
-
- /*
- * I/O specific fields
- * FIXME: Merge these and struct ifmap into one
- */
- unsigned long mem_end; /* shared mem end */
- unsigned long mem_start; /* shared mem start */
- unsigned long base_addr; /* device I/O address */
- unsigned int irq; /* device IRQ number */
-
- /*
- * Some hardware also needs these fields, but they are not
- * part of the usual set specified in Space.c.
- */
-
- unsigned char if_port; /* Selectable AUI, TP,..*/
- unsigned char dma; /* DMA channel */
-
- unsigned long state;
-
- struct list_head dev_list;
- struct list_head napi_list;
-
- /* Net device features */
- unsigned long features;
- #define NETIF_F_SG 1 /* Scatter/gather IO. */
- #define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
- #define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
- #define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
- #define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
- #define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
- #define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
- #define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
- #define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
- #define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
- #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
- #define NETIF_F_GSO 2048 /* Enable software GSO. */
- #define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */
- /* do not use LLTX in new drivers */
- #define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */
- #define NETIF_F_GRO 16384 /* Generic receive offload */
- #define NETIF_F_LRO 32768 /* large receive offload */
-
- /* the GSO_MASK reserves bits 16 through 23 */
- #define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
- #define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
- #define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
-
- /* Segmentation offload features */
- #define NETIF_F_GSO_SHIFT 16
- #define NETIF_F_GSO_MASK 0x00ff0000
- #define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
- #define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
- #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
- #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
- #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
- #define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
-
- /* List of features with software fallbacks. */
- #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
-
-
- #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
- #define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
- #define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
- #define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
-
- /*
- * If one device supports one of these features, then enable them
- * for all in netdev_increment_features.
- */
- #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
- NETIF_F_SG | NETIF_F_HIGHDMA | \
- NETIF_F_FRAGLIST)
-
- /* Interface index. Unique device identifier */
- int ifindex;
- int iflink;
-
- struct net_device_stats stats;
-
- #ifdef CONFIG_WIRELESS_EXT
- /* List of functions to handle Wireless Extensions (instead of ioctl).
- * See <net/iw_handler.h> for details. Jean II */
- const struct iw_handler_def * wireless_handlers;
- /* Instance data managed by the core of Wireless Extensions. */
- struct iw_public_data * wireless_data;
- #endif
- /* Management operations */
- const struct net_device_ops *netdev_ops;
- const struct ethtool_ops *ethtool_ops;
-
- /* Hardware header description */
- const struct header_ops *header_ops;
-
- unsigned int flags; /* interface flags (a la BSD) */
- unsigned short gflags;
- unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
- unsigned short padded; /* How much padding added by alloc_netdev() */
-
- unsigned char operstate; /* RFC2863 operstate */
- unsigned char link_mode; /* mapping policy to operstate */
-
- unsigned mtu; /* interface MTU value */
- unsigned short type; /* interface hardware type */
- unsigned short hard_header_len; /* hardware hdr length */
-
- /* extra head- and tailroom the hardware may need, but not in all cases
- * can this be guaranteed, especially tailroom. Some cases also use
- * LL_MAX_HEADER instead to allocate the skb.
- */
- unsigned short needed_headroom;
- unsigned short needed_tailroom;
-
- struct net_device *master; /* Pointer to master device of a group,
- * which this device is member of.
- */
-
- /* Interface address info. */
- unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
- unsigned char addr_len; /* hardware address length */
- unsigned short dev_id; /* for shared network cards */
-
- struct netdev_hw_addr_list uc; /* Secondary unicast
- mac addresses */
- int uc_promisc;
- spinlock_t addr_list_lock;
- struct dev_addr_list *mc_list; /* Multicast mac addresses */
- int mc_count; /* Number of installed mcasts */
- unsigned int promiscuity;
- unsigned int allmulti;
-
-
- /* Protocol specific pointers */
-
- #ifdef CONFIG_NET_DSA
- void *dsa_ptr; /* dsa specific data */
- #endif
- void *atalk_ptr; /* AppleTalk link */
- void *ip_ptr; /* IPv4 specific data */
- void *dn_ptr; /* DECnet specific data */
- void *ip6_ptr; /* IPv6 specific data */
- void *ec_ptr; /* Econet specific data */
- void *ax25_ptr; /* AX.25 specific data */
- struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
- assign before registering */
-
- /*
- * Cache line mostly used on receive path (including eth_type_trans())
- */
- unsigned long last_rx; /* Time of last Rx */
- /* Interface address info used in eth_type_trans() */
- unsigned char *dev_addr; /* hw address, (before bcast
- because most packets are
- unicast) */
-
- struct netdev_hw_addr_list dev_addrs; /* list of device
- hw addresses */
-
- unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
-
- struct netdev_queue rx_queue;
-
- struct netdev_queue *_tx ____cacheline_aligned_in_smp;
-
- /* Number of TX queues allocated at alloc_netdev_mq() time */
- unsigned int num_tx_queues;
-
- /* Number of TX queues currently active in device */
- unsigned int real_num_tx_queues;
-
- /* root qdisc from userspace point of view */
- struct Qdisc *qdisc;
-
- unsigned long tx_queue_len; /* Max frames per queue allowed */
- spinlock_t tx_global_lock;
- /*
- * One part is mostly used on xmit path (device)
- */
- /* These may be needed for future network-power-down code. */
-
- /*
- * trans_start here is expensive for high speed devices on SMP,
- * please use netdev_queue->trans_start instead.
- */
- unsigned long trans_start; /* Time (in jiffies) of last Tx */
-
- int watchdog_timeo; /* used by dev_watchdog() */
- struct timer_list watchdog_timer;
-
- /* Number of references to this device */
- atomic_t refcnt ____cacheline_aligned_in_smp;
-
- /* delayed register/unregister */
- struct list_head todo_list;
- /* device index hash chain */
- struct hlist_node index_hlist;
-
- struct net_device *link_watch_next;
-
- /* register/unregister state machine */
- enum { NETREG_UNINITIALIZED=0,
- NETREG_REGISTERED, /* completed register_netdevice */
- NETREG_UNREGISTERING, /* called unregister_netdevice */
- NETREG_UNREGISTERED, /* completed unregister todo */
- NETREG_RELEASED, /* called free_netdev */
- NETREG_DUMMY, /* dummy device for NAPI poll */
- } reg_state;
-
- /* Called from unregister, can be used to call free_netdev */
- void (*destructor)(struct net_device *dev);
-
- #ifdef CONFIG_NETPOLL
- struct netpoll_info *npinfo;
- #endif
-
- #ifdef CONFIG_NET_NS
- /* Network namespace this network device is inside */
- struct net *nd_net;
- #endif
-
- /* mid-layer private */
- void *ml_priv;
-
- /* bridge stuff */
- struct net_bridge_port *br_port;
- /* macvlan */
- struct macvlan_port *macvlan_port;
- /* GARP */
- struct garp_port *garp_port;
-
- /* class/net/name entry */
- struct device dev;
- /* space for optional statistics and wireless sysfs groups */
- const struct attribute_group *sysfs_groups[3];
-
- /* rtnetlink link ops */
- const struct rtnl_link_ops *rtnl_link_ops;
-
- /* VLAN feature mask */
- unsigned long vlan_features;
-
- /* for setting kernel sock attribute on TCP connection setup */
- #define GSO_MAX_SIZE 65536
- unsigned int gso_max_size;
-
- #ifdef CONFIG_DCB
- /* Data Center Bridging netlink ops */
- struct dcbnl_rtnl_ops *dcbnl_ops;
- #endif
-
- #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
- /* max exchange id for FCoE LRO by ddp */
- unsigned int fcoe_ddp_xid;
- #endif
- }
对于驱动的初始化操作这里不再说明,可以参考drivers/net下的代码.
我们注册的设备,实际上是添加到了dev_base链表,可以通过dev_get_by_name和dev_get_by_index查询.
对于设备的状态,包括它的状态机,一直没弄明白,觉得很神秘,这里就来看看.
设备状态:
Net_device中
flags用于存储各种标志的位域。多数标志代表设备的能力。然而,其中之一的IFF_UP是用于指出该设备是开启或关闭,可以在include/linux/if.h中找到IFF_XX
这里实际例子就是dev_queue_xmit函数里
Reg_state 设备注册状态
在 界于netreg_uninitalized和netreg_registered之间。 由netdev_run_todo处理。
点击(此处)折叠或打开
- /* register/unregister state machine */
- enum { NETREG_UNINITIALIZED=0,
- NETREG_REGISTERED, /* completed register_netdevice */
- NETREG_UNREGISTERING, /* called unregister_netdevice */
- NETREG_UNREGISTERED, /* completed unregister todo */
- NETREG_RELEASED, /* called free_netdev */
- NETREG_DUMMY, /* dummy device for NAPI poll */
- } reg_state;
State和其队列规则有关的设备状态
点击(此处)折叠或打开
- /* These flag bits are private to the generic network queueing
- * layer, they may not be explicitly referenced by any other
- * code.
- */
-
- enum netdev_state_t
- {
- __LINK_STATE_START, // device is up; checked by netif_running()
- __LINK_STATE_PRESENT, // device is present; checked by netif_device_present(); has to be handled across suspend/resume
- __LINK_STATE_NOCARRIER, // no carrier; checked by netif_carrier_ok()
- __LINK_STATE_LINKWATCH_PENDING,
- __LINK_STATE_DORMANT,
- };
点击(此处)折叠或打开
- /*
- * Default initial state at registry is that the
- * device is present.
- */
-
- set_bit(__LINK_STATE_PRESENT, &dev->state);
队列规则状态:(个人对这一块比较感兴趣,因为涉及qos等)
每个设备都会被分配一种队列规则,流量控制以此实现其qos机制。
队列规则即qos 是在register_netdevice中初始化的
由函数dev_init_scheduler(dev);来处理.
点击(此处)折叠或打开
- /*
- * Give @dev its initial queueing disciplines: the root qdisc pointer,
- * every TX queue and the rx_queue are all pointed at noop_qdisc. The
- * watchdog timer is then set up with dev_watchdog as its function and
- * the device pointer (cast to unsigned long) as its argument.
- */
- void dev_init_scheduler(struct net_device *dev)
- {
- dev->qdisc = &noop_qdisc;
- netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
- dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
-
- setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
- }
而关于noop_qdisc. 我们在看netif_receive_skb时会看到
点击(此处)折叠或打开
- #ifdef CONFIG_NET_CLS_ACT
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
- if (!skb)
- goto out;
- ncls:
- #endif
点击(此处)折叠或打开
- #ifdef CONFIG_NET_CLS_ACT
- /* TODO: Maybe we should just force sch_ingress to be compiled in
- * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
- * a compare and 2 stores extra right now if we dont have it on
- * but have CONFIG_NET_CLS_ACT
- * NOTE: This doesnt stop any functionality; if you dont have
- * the ingress scheduler, you just cant add policies on ingress.
- *
- */
- /*
- * Run @skb through the qdisc attached to the rx_queue of skb->dev.
- *
- * Returns TC_ACT_SHOT when the redirect TTL stored in tc_verd already
- * exceeds MAX_RED_LOOP. Returns TC_ACT_OK when the rx qdisc is
- * noop_qdisc or is marked __QDISC_STATE_DEACTIVATED. Otherwise returns
- * the result of qdisc_enqueue_root().
- */
- static int ing_filter(struct sk_buff *skb)
- {
- struct net_device *dev = skb->dev;
- u32 ttl = G_TC_RTTL(skb->tc_verd);
- struct netdev_queue *rxq;
- int result = TC_ACT_OK;
- struct Qdisc *q;
-
- /* the comparison uses the old ttl; the incremented value is stored below */
- if (MAX_RED_LOOP < ttl++) {
- printk(KERN_WARNING
- "Redir loop detected Dropping packet (%d->%d)\n",
- skb->iif, dev->ifindex);
- return TC_ACT_SHOT;
- }
-
- skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
- rxq = &dev->rx_queue;
-
- q = rxq->qdisc;
- if (q != &noop_qdisc) {
- /* enqueue under the qdisc lock, and only while the qdisc is active */
- spin_lock(qdisc_lock(q));
- if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
- result = qdisc_enqueue_root(skb, q);
- spin_unlock(qdisc_lock(q));
- }
-
- return result;
- }
-
- /*
- * Ingress hook for a received @skb.
- *
- * When the device's rx_queue still uses noop_qdisc, nothing is filtered:
- * tc_verd is cleared and @skb is returned. Otherwise a pending handler
- * in *@pt_prev is delivered first (its result stored in *@ret, and
- * *@pt_prev reset to NULL), then ing_filter() is run. A TC_ACT_SHOT or
- * TC_ACT_STOLEN verdict frees the skb and returns NULL. Any other
- * verdict clears tc_verd and returns @skb.
- */
- static inline struct sk_buff *handle_ing(struct sk_buff *skb,
- struct packet_type **pt_prev,
- int *ret, struct net_device *orig_dev)
- {
- if (skb->dev->rx_queue.qdisc == &noop_qdisc)
- goto out;
-
- if (*pt_prev) {
- *ret = deliver_skb(skb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- } else {
- /* Huh? Why does turning on AF_PACKET affect this? */
- skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
- }
-
- switch (ing_filter(skb)) {
- case TC_ACT_SHOT:
- case TC_ACT_STOLEN:
- kfree_skb(skb);
- return NULL;
- }
-
- out:
- skb->tc_verd = 0;
- return skb;
- }
- #endif
点击(此处)折叠或打开
- /*
- * Point both the qdisc and qdisc_sleeping fields of @dev_queue at the
- * qdisc passed in through the untyped @_qdisc argument. @dev is not
- * used in the body.
- */
- static void dev_init_scheduler_queue(struct net_device *dev,
- struct netdev_queue *dev_queue,
- void *_qdisc)
- {
- struct Qdisc *qdisc = _qdisc;
-
- dev_queue->qdisc = qdisc;
- dev_queue->qdisc_sleeping = qdisc;
- }
而队列的最开始初始化是在net/sched/sch_api.c中
点击(此处)折叠或打开
- /*
- * Init routine of the packet scheduler API, hooked in through
- * subsys_initcall(). Registers the pfifo, bfifo and mq qdisc ops,
- * creates the "psched" proc entry in init_net, and registers the
- * rtnetlink handlers for qdisc and traffic-class messages.
- * Always returns 0; the results of the individual registration calls
- * are not checked here.
- */
- static int __init pktsched_init(void)
- {
- register_qdisc(&pfifo_qdisc_ops);
- register_qdisc(&bfifo_qdisc_ops);
- register_qdisc(&mq_qdisc_ops);
- proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
-
- /* RTM_DELQDISC and RTM_GETQDISC share tc_get_qdisc; only the GET messages pass a dump callback */
- rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
- /* all three traffic-class messages go through tc_ctl_tclass */
- rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
- rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
-
- return 0;
- }
-
- subsys_initcall(pktsched_init);
rtnl_unlock时 会调用netdev_run_todo来执行任务.
至于注册中的rtNetlink通知链这里不说!
设备引用计数 Dev->refcnt 初始化值为1 .
操作函数:dev_put 、dev_hold
很多时候,我们会需要动态的注销一些设备,这个时候在注销的时候 会发送通知让其他引用的子系统释放掉引用.
那么就需要netdev_run_todo 调用netdev_wait_allrefs来定时检查.它会主动发送NETDEV_UNREGISTER通知信息给netdev_chain,直到引用为0.
开启和关闭网络设备需要具体做那些工作以及他们之间的顺序?
1. 调用dev->open
2. 设置dev->state :__LINK_STATE_START
3. 设置dev->flags中IFF_UP
4. 调用dev_activate初始化由流量控制使用的出口队列规则,然后启动watchdog
5. 传送NETDEV_UP通知链netdev_chain ,告知其他内核组件做出反应
关闭的流程和open相反
当然这里和注册和注销所做是不同的事情。
涉及到netif_device_detach的作用 挂起的时候。
恢复的时候 用 netif_device_attach
对网络设备操作的常用工具:
ifconfig、ethtool、iproute2 /mii-tools
ifconfig 工作原理?
Socket ioctl ----> Dev_ioctl
虚拟设备
当然这篇文章很琐碎很杂,不过大部分设备初始化或者注册什么都说到了. 这里写它,只是为了使心中模糊的东西,更加清晰化.