Files
agnos-kernel-sdm845/net/core/dev.c
Channagoud Kadabi 8810e5fa00 Merge remote-tracking branch 'origin/tmp-dcb6110' into 4.8
* origin/tmp-dcb6110:
  ANDROID: goldfish_sync: 32 max cmds to save stack
  ANDROID: sched/walt: use div_u64 instead of do_div
  Linux 4.9.9
  drm/i915/execlists: Reset RING registers upon resume
  fs: break out of iomap_file_buffered_write on fatal signals
  iw_cxgb4: set correct FetchBurstMax for QPs
  x86/irq: Make irq activate operations symmetric
  irqdomain: Avoid activating interrupts more than once
  iio: health: max30100: fixed parenthesis around FIFO count check
  iio: dht11: Use usleep_range instead of msleep for start signal
  iio: health: afe4403: retrieve a valid iio_dev in suspend/resume
  iio: health: afe4404: retrieve a valid iio_dev in suspend/resume
  iio: adc: palmas_gpadc: retrieve a valid iio_dev in suspend/resume
  staging: greybus: timesync: validate platform state callback
  USB: serial: option: add device ID for HP lt2523 (Novatel E371)
  usb: gadget: f_fs: Assorted buffer overflow checks.
  usb: musb: Fix host mode error -71 regression
  USB: Add quirk for WORLDE easykey.25 MIDI keyboard
  USB: serial: pl2303: add ATEN device ID
  USB: serial: qcserial: add Dell DW5570 QDL
  KVM: x86: do not save guest-unsupported XSAVE state
  dmaengine: cppi41: Fix oops in cppi41_runtime_resume
  dmaengine: cppi41: Fix runtime PM timeouts with USB mass storage
  perf/x86/intel/uncore: Clean up hotplug conversion fallout
  HID: wacom: Fix poor prox handling in 'wacom_pl_irq'
  HID: hid-lg: Fix immediate disconnection of Logitech Rumblepad 2
  HID: usbhid: Quirk a AMI virtual mouse and keyboard with ALWAYS_POLL
  iwlwifi: mvm: avoid crash on restart w/o reserved queues
  iwlwifi: fix double hyphen in MODULE_FIRMWARE for 8000
  pinctrl: intel: merrifield: Add missed check in mrfld_config_set()
  pinctrl: baytrail: Debounce register is one per community
  Revert "vring: Force use of DMA API for ARM-based systems with legacy devices"
  Revert "bcma: init serial console directly from ChipCommon code"
  percpu-refcount: fix reference leak during percpu-atomic transition
  regulator: axp20x: AXP806: Fix dcdcb being set instead of dcdce
  vhost: fix initialization for vq->is_le
  mmc: sdhci: Ignore unexpected CARD_INT interrupts
  cgroup: don't online subsystems before cgroup_name/path() are operational
  can: bcm: fix hrtimer/tasklet termination in bcm op removal
  tracing: Fix hwlat kthread migration
  mm, fs: check for fatal signals in do_generic_file_read()
  base/memory, hotplug: fix a kernel oops in show_valid_zones()
  mm/memory_hotplug.c: check start_pfn in test_pages_in_a_zone()
  cifs: initialize file_info_lock
  zswap: disable changing params if init fails
  svcrpc: fix oops in absence of krb5 module
  NFSD: Fix a null reference case in find_or_create_lock_stateid()
  powerpc/mm: Use the correct pointer when setting a 2MB pte
  powerpc: Fix build failure with clang due to BUILD_BUG_ON()
  powerpc: Add missing error check to prom_find_boot_cpu()
  powerpc/eeh: Fix wrong flag passed to eeh_unfreeze_pe()
  libata: Fix ATA request sense
  libata: apply MAX_SEC_1024 to all CX1-JB*-HP devices
  ata: sata_mv:- Handle return value of devm_ioremap.
  perf/core: Fix PERF_RECORD_MMAP2 prot/flags for anonymous memory
  perf/core: Fix use-after-free bug
  crypto: arm64/aes-blk - honour iv_out requirement in CBC and CTR modes
  crypto: api - Clear CRYPTO_ALG_DEAD bit before registering an alg
  drm/nouveau/nv1a,nv1f/disp: fix memory clock rate retrieval
  drm/nouveau/disp/gt215: Fix HDA ELD handling (thus, HDMI audio) on gt215
  drm/amdgpu/si: fix crash on headless asics
  pinctrl: baytrail: Add missing spinlock usage in byt_gpio_irq_handler
  HID: cp2112: fix gpio-callback error handling
  HID: cp2112: fix sleep-while-atomic
  xtensa: fix noMMU build on cores with MMU
  efi/fdt: Avoid FDT manipulation after ExitBootServices()
  x86/efi: Always map the first physical page into the EFI pagetables
  ext4: validate s_first_meta_bg at mount time
  PCI/ASPM: Handle PCI-to-PCIe bridges as roots of PCIe hierarchies
  ANDROID: sched: Add Kconfig option DEFAULT_USE_ENERGY_AWARE to set ENERGY_AWARE feature flag
  ANDROID: goldfish_sync: Fix sync_file_obj is NULL but dereferenced problem
  ANDROID: goldfish_sync: Isolate single module to fix compilation
  ANDROID: goldfish_sync: update defconfig for 4.9-compatible version
  ANDROID: goldfish_sync: upgrade to new fence sync api
  Linux 4.9.8
  xfs: fix bmv_count confusion w/ shared extents
  xfs: clear _XBF_PAGES from buffers when readahead page
  xfs: extsize hints are not unlikely in xfs_bmap_btalloc
  xfs: remove racy hasattr check from attr ops
  xfs: verify dirblocklog correctly
  xfs: fix COW writeback race
  xfs: fix xfs_mode_to_ftype() prototype
  xfs: don't wrap ID in xfs_dq_get_next_id
  xfs: sanity check inode di_mode
  xfs: sanity check inode mode when creating new dentry
  xfs: replace xfs_mode_to_ftype table with switch statement
  xfs: add missing include dependencies to xfs_dir2.h
  xfs: sanity check directory inode di_size
  xfs: make the ASSERT() condition likely
  xfs: don't print warnings when xfs_log_force fails
  xfs: don't rely on ->total in xfs_alloc_space_available
  xfs: adjust allocation length in xfs_alloc_space_available
  xfs: fix bogus minleft manipulations
  xfs: bump up reserved blocks in xfs_alloc_set_aside
  net: dsa: Bring back device detaching in dsa_slave_suspend()
  lwtunnel: Fix oops on state free after encap module unload
  net: Specify the owning module for lwtunnel ops
  qmi_wwan/cdc_ether: add device ID for HP lt2523 (Novatel E371) WWAN card
  af_unix: move unix_mknod() out of bindlock
  r8152: don't execute runtime suspend if the tx is not empty
  net: mpls: Fix multipath selection for LSR use case
  bridge: netlink: call br_changelink() during br_dev_newlink()
  net/mlx5e: Do not recycle pages from emergency reserve
  tcp: initialize max window for a new fastopen socket
  ipv6: addrconf: Avoid addrconf_disable_change() using RCU read-side lock
  lwtunnel: fix autoload of lwt modules
  net: phy: bcm63xx: Utilize correct config_intr function
  net: fix harmonize_features() vs NETIF_F_HIGHDMA
  vxlan: fix byte order of vxlan-gpe port number
  virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving
  virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit
  net sched actions: fix refcnt when GETing of action after bind
  ax25: Fix segfault after sock connection timeout
  ip6_tunnel: Account for tunnel header in tunnel MTU
  ravb: do not use zero-length alignment DMA descriptor
  mlx4: do not call napi_schedule() without care
  openvswitch: maintain correct checksum state in conntrack actions
  tcp: fix tcp_fastopen unaligned access complaints on sparc
  net: systemport: Decouple flow control from __bcm_sysport_tx_reclaim
  net: ipv4: fix table id in getroute response
  net: lwtunnel: Handle lwtunnel_fill_encap failure
  mlxsw: pci: Fix EQE structure definition
  mlxsw: switchx2: Fix memory leak at skb reallocation
  mlxsw: spectrum: Fix memory leak at skb reallocation
  netvsc: add rcu_read locking to netvsc callback
  r8152: fix the sw rx checksum is unavailable
  FROMLIST: 9p: fix a potential acl leak
  ANDROID: sched/walt: use do_div instead of division operator
  ANDROID: sched: fix wrong truncation of walt_avg
  ANDROID: arm: Fix #if/#ifdef typo in topology.c
  ANDROID: arm: Fix build error "conflicting types for 'scale_cpu_capacity'"
  ANDROID: net: ipv6: remove unused variable ifindex in
  ANDROID: DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems
  ANDROID: arm: topology: Define TC2 energy and provide it to the scheduler
  ANDROID: binder: fix format specifier for type binder_size_t
  Linux 4.9.7
  drm/i915: Remove WaDisableLSQCROPERFforOCL KBL workaround.
  perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race
  mm, memcg: do not retry precharge charges
  platform/x86: intel_mid_powerbtn: Set IRQ_ONESHOT
  platform/x86: mlx-platform: free first dev on error
  virtio_mmio: Set DMA masks appropriately
  memory_hotplug: make zone_can_shift() return a boolean value
  pinctrl: baytrail: Rectify debounce support
  pinctrl: uniphier: fix Ethernet (RMII) pin-mux setting for LD20
  pinctrl: broxton: Use correct PADCFGLOCK offset
  s5k4ecgx: select CRC32 helper
  IB/rxe: Prevent from completer to operate on non valid QP
  IB/rxe: Fix rxe dev insertion to rxe_dev_list
  IB/umem: Release pid in error and ODP flow
  drm/i915: Check for NULL atomic state in intel_crtc_disable_noatomic()
  drm/i915: Fix calculation of rotated x and y offsets for planar formats
  drm/i915: Don't init hpd polling for vlv and chv from runtime_suspend()
  drm/i915: Don't leak edid in intel_crt_detect_ddc()
  drm/i915: prevent crash with .disable_display parameter
  drm/i915: Clear ret before unbinding in i915_gem_evict_something()
  v4l: tvp5150: Don't override output pinmuxing at stream on/off time
  v4l: tvp5150: Fix comment regarding output pin muxing
  v4l: tvp5150: Reset device at probe time, not in get/set format handlers
  pctv452e: move buffer to heap, no mutex
  iw_cxgb4: free EQ queue memory on last deref
  SUNRPC: cleanup ida information when removing sunrpc module
  NFSv4.0: always send mode in SETATTR after EXCLUSIVE4
  NFSv4.1: Fix a deadlock in layoutget
  nfs: Don't increment lock sequence ID after NFS4ERR_MOVED
  parisc: Don't use BITS_PER_LONG in userspace-exported swab.h header
  ARC: [arcompact] handle unaligned access delay slot corner case
  ARC: udelay: fix inline assembler by adding LP_COUNT to clobber list
  can: ti_hecc: add missing prepare and unprepare of the clock
  can: c_can_pci: fix null-pointer-deref in c_can_start() - set device pointer
  IB/srp: fix invalid indirect_sg_entries parameter value
  IB/srp: fix mr allocation when the device supports sg gaps
  IB/iser: Fix sg_tablesize calculation
  IB/cxgb3: fix misspelling in header guard
  s390/ptrace: Preserve previous registers for short regset write
  s390/mm: Fix cmma unused transfer from pgste into pte
  RDMA/cma: Fix unknown symbol when CONFIG_IPV6 is not enabled
  Btrfs: remove ->{get, set}_acl() from btrfs_dir_ro_inode_operations
  Btrfs: disable xattr operations on subvolume directories
  Btrfs: remove old tree_root case in btrfs_read_locked_inode()
  ISDN: eicon: silence misleading array-bounds warning
  xfs: prevent quotacheck from overloading inode lru
  sysctl: fix proc_doulongvec_ms_jiffies_minmax()
  userns: Make ucounts lock irq-safe
  vring: Force use of DMA API for ARM-based systems with legacy devices
  mm, page_alloc: fix premature OOM when racing with cpuset mems update
  mm, page_alloc: move cpuset seqcount checking to slowpath
  mm, page_alloc: fix fast-path race with cpuset update or removal
  mm, page_alloc: fix check for NULL preferred_zone
  mm/mempolicy.c: do not put mempolicy before using its nodemask
  mm/huge_memory.c: respect FOLL_FORCE/FOLL_COW for thp
  drm/atomic: clear out fence when duplicating state
  Revert "drm/radeon: always apply pci shutdown callbacks"
  drm/vc4: fix a bounds check
  drm/vc4: Return -EINVAL on the overflow checks failing.
  drm/vc4: Fix an integer overflow in temporary allocation layout.
  drm/vc4: Fix memory leak of the CRTC state.
  drm/i915: Ignore bogus plane coordinates on SKL when the plane is not visible
  drm: Fix broken VT switch with video=1366x768 option
  drm: Schedule the output_poll_work with 1s delay if we have delayed event
  tile/ptrace: Preserve previous registers for short regset write
  fbdev: color map copying bounds checking
  ANDROID: sched/walt: Drop arch-specific timer access
  ANDROID: sched/walt: include missing header for arm_timer_read_counter()
  ANDROID: fs: Export vfs_rmdir2
  ANDROID: fs: Export free_fs_struct and set_fs_pwd
  ANDROID: cpufreq: interactive: Use idle-end notifiers
  FROMLIST: cpufreq: Add android's 'interactive' governor
  ANDROID: cpufreq: conservative: fix duplicate 'static' error
  ANDROID: sdcardfs: eliminate the offset argument to ->direct_IO
  ANDROID: sdcardfs: make it use new .rename i_op
  ANDROID: sdcardfs: Propagate dentry down to inode_change_ok()
  ANDROID: sdcardfs: get rid of 'parent' argument of ->d_compare()
  ANDROID: sdcardfs: add parent pointer into dentry name hash
  ANDROID: sdcardfs: use wrappers to access i_mutex
  ANDROID: mnt: remount should propagate to slaves of slaves
  ANDROID: sdcardfs: Fix locking issue with permision fix up
  ANDROID: sdcardfs: Switch ->d_inode to d_inode()
  ANDROID: sdcardfs: Change magic value
  ANDROID: sdcardfs: Use per mount permissions
  ANDROID: sdcardfs: Add gid and mask to private mount data
  ANDROID: sdcardfs: User new permission2 functions
  ANDROID: vfs: Add permission2 for filesystems with per mount permissions
  ANDROID: vfs: Add setattr2 for filesystems with per mount permissions
  ANDROID: vfs: Allow filesystems to access their private mount data
  ANDROID: mnt: Add filesystem private data to mount points
  ANDROID: sdcardfs: Move directory unlock before touch
  ANDROID: sdcardfs: fix external storage exporting incorrect uid
  ANDROID: sdcardfs: Added top to sdcardfs_inode_info
  ANDROID: sdcardfs: Switch package list to RCU
  ANDROID: sdcardfs: Fix locking for permission fix up
  ANDROID: sdcardfs: Check for other cases on path lookup
  ANDROID: sdcardfs: override umask on mkdir and create
  ANDROID: sched/debug: Add energy procfs interface
  ANDROID: cpufreq: sched: Fix kernel crash on accessing sysfs file
  ANDROID: FIXUP: sched/tune: add fixes missing from a previous patch
  ANDROID: sched: tune: Fix lacking spinlock initialization
  ANDROID: cgroup: Remove leftover instances of allow_attach
  ANDROID: FIXUP: sched: scheduler-driven cpu frequency selection
  ANDROID: sched/rt: Add Kconfig option to enable panicking for RT throttling
  ANDROID: sched/rt: print RT tasks when RT throttling is activated
  ANDROID: sched/fair: Favor higher cpus only for boosted tasks
  ANDROID: sched/fair: call OPP update when going idle after migration
  ANDROID: sched/cpufreq_sched: fix thermal capping events
  ANDROID: sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs
  ANDROID: FIXUP: sched/tune: do initialization as a postcore_initicall
  ANDROID: DEBUG: sched: add tracepoint for RD overutilized
  ANDROID: sched/tune: Introducing a new schedtune attribute prefer_idle
  ANDROID: sched: use util instead of capacity to select busy cpu
  ANDROID: arch_timer: add error handling when the MPM global timer is cleared
  ANDROID: FIXUP: sched: Fix double-release of spinlock in move_queued_task
  ANDROID: FIXUP: sched/fair: Fix hang during suspend in sched_group_energy
  ANDROID: FIXUP: sched: fix SchedFreq integration for both PELT and WALT
  ANDROID: sched: EAS: Avoid causing spikes to max-freq unnecessarily
  ANDROID: FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use
  ANDROID: sched/walt: Accounting for number of irqs pending on each core
  ANDROID: sched: Introduce Window Assisted Load Tracking (WALT)
  ANDROID: sched/tune: fix PB and PC cuts indexes definition
  ANDROID: sched/fair: optimize idle cpu selection for boosted tasks
  ANDROID: FIXUP: sched/tune: fix accounting for runnable tasks
  ANDROID: sched/tune: use a single initialisation function
  ANDROID: FIXUP: sched/tune: fix payoff calculation for boost region
  ANDROID: sched/tune: Add support for negative boost values
  ANDROID: FIX: sched/tune: move schedtune_nornalize_energy into fair.c
  ANDROID: FIX: sched/tune: update usage of boosted task utilisation on CPU selection
  ANDROID: sched/fair: add tunable to set initial task load
  ANDROID: sched/fair: add tunable to force selection at cpu granularity
  ANDROID: sched: EAS: take cstate into account when selecting idle core
  ANDROID: sched/cpufreq_sched: Consolidated update
  ANDROID: FIXUP: sched: fix build for non-SMP target
  ANDROID: DEBUG: sched/tune: add tracepoint on P-E space filtering
  ANDROID: DEBUG: sched/tune: add tracepoint for energy_diff() values
  ANDROID: DEBUG: sched/tune: add tracepoint for task boost signal
  CHROMIUM: sched: update the average of nr_running
  ANDROID: DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values
  ANDROID: DEBUG: schedtune: add tracepoint for CPU boost signal
  ANDROID: DEBUG: schedtune: add tracepoint for SchedTune configuration update
  ANDROID: DEBUG: sched,cpufreq: add cpu_capacity change tracepoint
  ANDROID: DEBUG: sched: add tracepoint for CPU load/util signals
  ANDROID: DEBUG: sched: add tracepoint for task load/util signals
  ANDROID: DEBUG: sched: add tracepoint for cpu/freq scale invariance
  ANDROID: sched/fair: filter energy_diff() based on energy_payoff value
  ANDROID: sched/tune: add support to compute normalized energy
  ANDROID: sched/fair: keep track of energy/capacity variations
  ANDROID: sched/fair: add boosted task utilization
  ANDROID: sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value
  ANDROID: sched/tune: compute and keep track of per CPU boost value
  ANDROID: sched/tune: add initial support for CGroups based boosting
  ANDROID: sched/fair: add boosted CPU usage
  ANDROID: sched/fair: add function to convert boost value into "margin"
  ANDROID: sched/tune: add sysctl interface to define a boost value
  ANDROID: sched/tune: add detailed documentation
  ANDROID: fixup! sched: scheduler-driven cpu frequency selection
  ANDROID: sched: remove call of sched_avg_update from sched_rt_avg_update
  ANDROID: sched/cpufreq_sched: add trace events
  ANDROID: sched/fair: jump to max OPP when crossing UP threshold
  ANDROID: sched/fair: cpufreq_sched triggers for load balancing
  ANDROID: sched/{core,fair}: trigger OPP change request on fork()
  ANDROID: sched/fair: add triggers for OPP change requests
  ANDROID: sched: scheduler-driven cpu frequency selection
  ANDROID: cpufreq: introduce cpufreq_driver_is_slow
  ANDROID: sched: Add group_misfit_task load-balance type
  ANDROID: sched: Add per-cpu max capacity to sched_group_capacity
  ANDROID: sched: Do eas idle balance regardless of the rq avg idle value
  ANDROID: arm64: Enable max freq invariant scheduler load-tracking and capacity support
  ANDROID: arm: Enable max freq invariant scheduler load-tracking and capacity support
  ANDROID: sched: Update max cpu capacity in case of max frequency constraints
  ANDROID: cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support
  ANDROID: sched: Disable energy-unfriendly nohz kicks
  ANDROID: sched: Consider a not over-utilized energy-aware system as balanced
  ANDROID: sched: Energy-aware wake-up task placement
  ANDROID: sched: Determine the current sched_group idle-state
  ANDROID: sched, cpuidle: Track cpuidle state index in the scheduler
  ANDROID: sched: Add over-utilization/tipping point indicator
  ANDROID: sched: Estimate energy impact of scheduling decisions
  ANDROID: sched: Extend sched_group_energy to test load-balancing decisions
  ANDROID: sched: Calculate energy consumption of sched_group
  ANDROID: sched: Highest energy aware balancing sched_domain level pointer
  ANDROID: sched: Relocated cpu_util() and change return type
  ANDROID: sched: Compute cpu capacity available at current frequency
  ANDROID: sched: Support for extracting EAS energy costs from DT
  ANDROID: arm64, topology: Updates to use DT bindings for EAS costing data
  ANDROID: arm64: Cpu invariant scheduler load-tracking and capacity support
  ANDROID: arm: Cpu invariant scheduler load-tracking and capacity support
  ANDROID: sched: Introduce SD_SHARE_CAP_STATES sched_domain flag
  ANDROID: sched: Initialize energy data structures
  ANDROID: sched: Make energy awareness a sched feature
  ANDROID: sched: Documentation for scheduler energy cost model
  ANDROID: sched: Prevent unnecessary active balance of single task in sched group
  ANDROID: sched: Enable idle balance to pull single task towards cpu with higher capacity
  ANDROID: sched: Consider spare cpu capacity at task wake-up
  ANDROID: sched: Add cpu capacity awareness to wakeup balancing
  ANDROID: arm: Update arch_scale_cpu_capacity() to reflect change to define
  ANDROID: arm64: Enable frequency invariant scheduler load-tracking support
  ANDROID: arm: Enable frequency invariant scheduler load-tracking support
  ANDROID: cpufreq: Frequency invariant scheduler load-tracking support
  ANDROID: [CPUFREQ] Don't export governors for default governor
  ANDROID: kernel/configs: recommended: CONFIG_ARM64_SW_TTBR0_PAN=y
  ANDROID: kernel/configs: base: Enable QUOTA related configs
  ANDROID: kernel/configs: recommended: Enable MEMORY_STATE_TIME
  FROMLIST: config: android-base: enable hardened usercopy and kernel ASLR
  FROMLIST: config: android-recommended: disable aio support
  ANDROID: kernel/configs: recommended: enable fstack-protector-strong
  ANDROID: kernel/configs: base: enable UID_CPUTIME
  ANDROID: kernel/configs: base: restrict access to perf events
  ANDROID: configs: base: enable configfs gadget functions
  ANDROID: configs: merge AOSP config fragments
  ANDROID: Implement memory_state_time, used by qcom,cpubw
  ANDROID: dm: rebase for 4.9
  ANDROID: usb: otg-wakelock: Remove wakelock.h dependencies
  ANDROID: gpio_matrix: Remove wakelock.h dependencies
  ANDROID: fiq_debugger: Remove wakelock.h dependencies
  UPSTREAM: net: socket: don't set sk_uid to garbage value in ->setattr()
  ANDROID: trace: net: use %pK for kernel pointers
  UPSTREAM: net: ipv4: Don't crash if passing a null sk to ip_rt_update_pmtu.
  UPSTREAM: net: inet: Support UID-based routing in IP protocols.
  UPSTREAM: net: core: add UID to flows, rules, and routes
  UPSTREAM: net: core: Add a UID field to struct sock.
  ANDROID: fs: FS tracepoints to track IO.
  ANDROID: MMC/UFS IO Latency Histograms.
  CHROMIUM: fix warning when releasing active sync point
  ANDROID: goldfish_pipe: fix allmodconfig build
  ANDROID: goldfish: goldfish_pipe: fix locking errors
  ANDROID: goldfish_pipe: fix call_kern.cocci warnings
  ANDROID: goldfish_pipe: An implementation of more parallel pipe
  ANDROID: goldfish_pipe: bugfixes and performance improvements.
  ANDROID: goldfish: disable GOLDFISH_SYNC
  ANDROID: goldfish: enable CONFIG_INET_DIAG_DESTROY
  ANDROID: build: fix build config kernel_dir
  ANDROID: dm verity: add minimum prefetch size
  ANDROID: build: add build server configs for goldfish
  UPSTREAM: trace: Update documentation for mono, mono_raw and boot clock
  UPSTREAM: trace: Add an option for boot clock as trace clock
  UPSTREAM: timekeeping: Add a fast and NMI safe boot clock
  ANDROID: video: goldfishfb: fix platform_no_drv_owner.cocci warnings
  ANDROID: arm64: rename ranchu defconfig to ranchu64
  ANDROID: arch: x86: disable pic for Android toolchain
  ANDROID: goldfish: Add goldfish sync driver
  ANDROID: goldfish: add ranchu defconfigs
  ANDROID: goldfish_audio: Clear audio read buffer status after each read
  ANDROID: goldfish_events: no extra EV_SYN; register goldfish
  ANDROID: goldfish_fb: Set pixclock = 0
  ANDROID: goldfish: Enable ACPI-based enumeration for goldfish audio
  ANDROID: goldfish: Enable ACPI-based enumeration for goldfish framebuffer
  ANDROID: video: goldfishfb: add devicetree bindings
  ANDROID: usb: gadget: function: cleanup: Add blank line after declaration
  ANDROID: usb: gadget: f_mtp: simplify ptp NULL pointer check
  ANDROID: usb: gadget: audio_source: fix comparison of distinct pointer types
  ANDROID: binder: support for file-descriptor arrays.
  ANDROID: binder: support for scatter-gather.
  ANDROID: binder: add extra size to allocator.
  ANDROID: binder: refactor binder_transact()
  ANDROID: binder: support multiple /dev instances.
  ANDROID: binder: deal with contexts in debugfs.
  ANDROID: binder: support multiple context managers.
  ANDROID: binder: split flat_binder_object.
  ANDROID: [RFC]cgroup: Change from CAP_SYS_NICE to CAP_SYS_RESOURCE for cgroup migration permissions
  CHROMIUM: cgroups: relax permissions on moving tasks between cgroups
  ANDROID: dm: android-verity: Remove fec_header location constraint
  ANDROID: fiq_debugger: Pass task parameter to unwind_frame()
  ANDROID: input: keyreset: switch to orderly_reboot
  ANDROID: cpuset: Make cpusets restore on hotplug
  ANDROID: Don't show empty tag stats for unprivileged uids
  ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module
  ANDROID: dm-verity: adopt changes made to dm callbacks
  ANDROID: dm verity fec: pack the fec_header structure
  ANDROID: dm: android-verity: Verify header before fetching table
  ANDROID: dm: allow adb disable-verity only in userdebug
  ANDROID: dm: mount as linear target if eng build
  ANDROID: dm: use default verity public key
  ANDROID: dm: fix signature verification flag
  ANDROID: dm: use name_to_dev_t
  ANDROID: dm: rename dm-linear methods for dm-android-verity
  ANDROID: dm: Minor cleanup
  ANDROID: dm: Mounting root as linear device when verity disabled
  ANDROID: dm-android-verity: Rebase on top of 4.1
  ANDROID: dm: Add android verity target
  ANDROID: dm: fix dm_substitute_devices()
  ANDROID: dm: Rebase on top of 4.9
  CHROMIUM: dm: boot time specification of dm=
  ANDROID: usb: gadget: f_accessory: remove duplicate endpoint alloc
  ANDROID: sdcardfs: fix itnull.cocci warnings
  ANDROID: sdcardfs: Truncate packages_gid.list on overflow
  ANDROID: netfilter: xt_quota2: make quota2_log work well
  ANDROID: cpu: send KOBJ_ONLINE event when enabling cpus
  ANDROID: dm verity fec: initialize recursion level
  ANDROID: dm verity fec: add missing release from fec_ktype
  ANDROID: dm verity fec: limit error correction recursion
  FROMLIST: security,perf: Allow further restriction of perf_event_open
  ANDROID: ARM64: Ignore Image-dtb from git point of view
  ANDROID: arm64: add option to build Image-dtb
  ANDROID: usb: gadget: f_midi: set fi->f to NULL when free f_midi function
  ANDROID: xt_qtaguid: Fix panic caused by processing non-full socket.
  ANDROID: fiq_debugger: Add fiq_debugger.disable option
  FROMLIST: wlcore: Disable filtering in AP role
  ANDROID: fiq_debugger: Add option to apply uart overlay by FIQ_DEBUGGER_UART_OVERLAY
  ANDROID: usb: dual-role: make stub functions inline
  ANDROID: quick selinux support for tracefs
  ANDROID: xt_qtaguid: Fix panic caused by synack processing
  ANDROID: fuse: Add support for d_canonical_path
  ANDROID: vfs: change d_canonical_path to take two paths
  ANDROID: netfilter: xt_qtaguid: seq_printf fixes
  ANDROID: mmc: Add CONFIG_MMC_SIMULATE_MAX_SPEED
  ANDROID: dm verity fec: add sysfs attribute fec/corrected
  ANDROID: mm: Export do_munmap
  ANDROID: sdcardfs: remove unneeded __init and __exit
  ANDROID: sdcardfs: Remove unused code
  ANDROID: fs: Export d_absolute_path
  ANDROID: sdcardfs: remove effectless config option
  ANDROID: inotify: Fix erroneous update of bit count
  ANDROID: fs: sdcardfs: Declare LOOKUP_CASE_INSENSITIVE unconditionally
  ANDROID: trace: cpufreq: fix typo in min/max cpufreq
  ANDROID: sdcardfs: Add support for d_canonical_path
  ANDROID: vfs: add d_canonical_path for stacked filesystem support
  ANDROID: sdcardfs: Bring up to date with Android M permissions:
  ANDROID: Changed type-casting in packagelist management
  ANDROID: Port of sdcardfs to 4.4
  ANDROID: Included sdcardfs source code for kernel 3.0
  ANDROID: usb: gadget: Add support for MTP OS desc
  CHROMIUM: usb: gadget: f_accessory: add .raw_request callback
  CHROMIUM: usb: gadget: audio_source: add .free_func callback
  CHROMIUM: usb: gadget: f_mtp: fix usb_ss_ep_comp_descriptor
  CHROMIUM: usb: gadget: f_mtp: Add SuperSpeed support
  ANDROID: dm-crypt: run in a WQ_HIGHPRI workqueue
  ANDROID: power: Provide dummy log_suspend_abort_reason() if SUSPEND is disabled
  ANDROID: PM / suspend: Add dependency on RTC_LIB
  ANDROID: net: pppolac/pppopns: Replace msg.msg_iov with iov_iter_kvec()
  ANDROID: mmc: sdio: Disable retuning in sdio_reset_comm()
  ANDROID: kernel/watchdog: fix unused variable warning
  ANDROID: usb: gadget: f_mtp: don't use le16 for u8 field
  ANDROID: lowmemorykiller: fix declaration order warnings
  ANDROID: mmc: move to a SCHED_FIFO thread
  ANDROID: skip building drivers as modules
  ANDROID: wakeup: Add the guard condition for len in pm_get_active_wakeup_sources
  ANDROID: goldfish: pipe: fix platform_no_drv_owner.cocci warnings
  ANDROID: epoll: use freezable blocking call
  ANDROID: Fix for in kernel emergency remount when loop mounts are used
  ANDROID: kbuild: Makefile.clean: make Kbuild and Makefile optional
  ANDROID: kbuild: make it possible to specify the module output dir
  ANDROID: ext4: Add support for FIDTRIM, a best-effort ioctl for deep discard trim
  ANDROID: hardlockup: detect hard lockups without NMIs using secondary cpus
  ANDROID: rtc-palmas: correct for bcd year
  ANDROID: w1: ds2482: Manage SLPZ pin sleep state
  ANDROID: fuse: Freeze client on suspend when request sent to userspace
  ANDROID: serial_core: Add wake_peer uart operation
  ANDROID: mm: add a field to store names for private anonymous memory
  ANDROID: pstore/ram: Add ramoops_console_write_buf api
  ANDROID: pstore: Update Documentation/android.txt
  ANDROID: initramfs: Add skip_initramfs command line option
  ANDROID: of: Fix build warnings
  ANDROID: of: fix CONFIG_CMDLINE_EXTEND
  ANDROID: ARM64: copy CONFIG_CMDLINE_EXTEND from ARM
  ANDROID: of: Support CONFIG_CMDLINE_EXTEND config option
  ANDROID: ARM: decompressor: Flush tlb before swiching domain 0 to client mode
  ANDROID: ARM64: add option to build Image.gz/dtb combo
  ANDROID: ARM: convert build of appended dtb zImage to list of dtbs
  ANDROID: ARM: add config option to build zImage/dtb combo
  ANDROID: ARM: Fix dtb list when DTB_IMAGE_NAMES is empty
  ANDROID: arm64: pass return address to dma_common_contiguous_remap
  ANDROID: arch: arm64: force -fno-pic
  ANDROID: arm64: process: dump memory around registers when displaying regs
  ANDROID: arm64: check for upper PAGE_SHIFT bits in pfn_valid()
  ANDROID: ARM: fault: assume no context when IRQs are disabled during data abort.
  ANDROID: ARM: Fix "Make low-level printk work" to use a separate config option
  ANDROID: ARM: add option to flush console before reboot
  ANDROID: ARM: Make low-level printk work
  ANDROID: Optionally flush entire dcache from v6_dma_flush_range
  ANDROID: process: Add display of memory around registers when displaying regs.
  ANDROID: security: Add proper checks for Android specific capability checks
  ANDROID: uid_cputime: skip power reporting per uid for now
  ANDROID: uid_cputime: Check for the range while removing range of UIDs.
  ANDROID: uid_cputime: Iterates over all the threads instead of processes.
  ANDROID: uid_cputime: fix cputime overflow
  ANDROID: uid_cputime: Avoids double accounting of process stime, utime and cpu_power in task exit.
  ANDROID: uid_cputime: Extends the cputime functionality to report power per uid
  ANDROID: proc: uid_cputime: fix show_uid_stat permission
  ANDROID: proc: uid_cputime: create uids from kuids
  ANDROID: proc: uid: Adds accounting for the cputimes per uid.
  ANDROID: fixup! proc: make oom adjustment files user read-only
  ANDROID: proc: make oom adjustment files user read-only
  ANDROID: proc: smaps: Allow smaps access for CAP_SYS_RESOURCE
  ANDROID: wakeup_reason: use vsnprintf instead of snsprintf for vargs.
  ANDROID: power: wakeup_reason: fix suspend time reporting
  ANDROID: wakeup: Add last wake up source logging for suspend abort reason.
  ANDROID: Power: Report suspend times from last_suspend_time
  ANDROID: Make suspend abort reason logging depend on CONFIG_PM_SLEEP
  ANDROID: power: Add check_wakeup_reason() to verify wakeup source irq
  ANDROID: power: Adds functionality to log the last suspend abort reason.
  ANDROID: power: Avoids bogus error messages for the suspend aborts.
  ANDROID: power: Add property CHARGE_COUNTER_EXT and 64-bit precision properties
  ANDROID: Power: Changes the permission to read only for sysfs file /sys/kernel/wakeup_reasons/last_resume_reason
  ANDROID: power: wakeup_reason: rename irq_count to irqcount
  ANDROID: Power: Add guard condition for maximum wakeup reasons
  ANDROID: POWER: fix compile warnings in log_wakeup_reason
  ANDROID: Power: add an API to log wakeup reasons
  ANDROID: PM / Suspend: Print wall time at suspend entry and exit
  ANDROID: power: power_supply: add POWER_SUPPLY_PROP_CHARGE_ENABLED
  ANDROID: power: power_supply: add POWER_SUPPLY_PROP_USB_OTG
  ANDROID: power: power_supply: move POWER_SUPPLY_PROP_USB_HC to type 'int' order
  ANDROID: power_supply: Add custom property for USB High Current mode
  ANDROID: trace: power: add trace_clock_set_parent
  ANDROID: trace: cpufreq: Add tracing for min/max cpufreq
  ANDROID: trace: fix compilation for 4.1
  ANDROID: trace/events: fix gpu event timestamp formatting
  ANDROID: trace: add non-hierarchical function_graph option
  ANDROID: trace: Add an option to show tgids in trace output
  ANDROID: trace/events: add gpu trace events
  ANDROID: sync: add Documentation/sync.txt
  ANDROID: ARM: Call idle notifiers
  ANDROID: Move x86_64 idle notifiers to generic
  ANDROID: cpuidle: governor: menu: don't use loadavg
  ANDROID: sched: add sched blocked tracepoint which dumps out context of sleep.
  ANDROID: sched: Enable might_sleep before initializing drivers.
  ANDROID: fiq_debugger: Build fixes for 4.1
  ANDROID: fiq_debugger: Add fiq_watchdog_triggered api
  ANDROID: fiq_debugger: Call fiq_debugger_printf through a function pointer from cpu specific code
  ANDROID: fiq_debugger: add ARM64 support
  ANDROID: fiq_debugger: split arm support into fiq_debugger_arm.c
  ANDROID: fiq_debugger: use pt_regs for registers
  ANDROID: fiq_debugger: allow compiling without CONFIG_FIQ_GLUE
  ANDROID: fiq_debugger: rename debug->fiq_debugger
  ANDROID: fiq_debugger: move into drivers/staging/android/fiq_debugger/
  ANDROID: ARM: fiq_glue: Add custom fiq return handler api.
  ANDROID: ARM: kgdb: ignore breakpoint instructions from user mode
  ANDROID: ARM: fiq_debugger: Update tty code for 3.9
  ANDROID: ARM: fiq_debugger: Use kmsg_dumper to dump kernel logs
  ANDROID: ARM: fiq_debugger: Fix to compile on 3.7
  ANDROID: ARM: fiq_debugger: fix uninitialised spin_lock.
  ANDROID: ARM: fiq_debugger: lock between tty and console writes
  ANDROID: ARM: fiq_debugger: add process context reboot command
  ANDROID: ARM: fiq_debugger: fix multiple consoles and make it a preferred console
  ANDROID: kdb: support new lines without carriage returns
  ANDROID: ARM: fiq_debugger: add support for kgdb
  ANDROID: ARM: fiq_debugger: add debug_putc
  ANDROID: ARM: fiq_debugger: add support for reboot commands
  ANDROID: ARM: fiq_debugger: fix compiling for v3.3
  ANDROID: ARM: Add generic fiq serial debugger
  ANDROID: ARM: Add fiq_glue
  ANDROID: fix false disconnect due to a signal sent to the reading process
  ANDROID: usb: gadget: cleanup: fix unused variable and function warnings
  ANDROID: usb: gadget: build audio_source function only if SND is enabled
  ANDROID: usb: gadget: configfs: handle gadget reset request for android
  ANDROID: usb: gadget: create F_midi device
  ANDROID: usb: gadget: Add device attribute to determine gadget state
  ANDROID: usb: phy: fix dual role sysfs build if kernel modules are supported
  ANDROID: usb: phy: Dual role sysfs class definition
  ANDROID: usb: gadget: fix NULL ptr derefer while symlinking PTP func
  ANDROID: usb:gadget:Add "state" attribute to android_device
  ANDROID: usb: gadget: Do not disconnect unregistered dev
  ANDROID: usb: gadget: Relocate f_accessory
  ANDROID: usb: gadget: Accessory:Migrate to USB_FUNCTION API
  ANDROID: usb: gadget: Move gadget functions code
  ANDROID: usb:gadget:audio_source: Move to USB_FUNCTION API
  ANDROID: usb: gadget: Add function devices to the parent
  ANDROID: usb: gadget: f_audio_source:replace deprecated API
  ANDROID: usb: gadget: check for accessory device before disconnecting HIDs
  ANDROID: usb: gadget: Add Uevent to notify userspace
  ANDROID: usb: gadget: configfs: Add usb_function ptr to fi struct
  ANDROID: usb: gadget: mtp/ptp: Migrate functions to the USB_FUNCTION interface
  Linux 4.9.6
  libceph: stop allocating a new cipher on every crypto request
  libceph: uninline ceph_crypto_key_destroy()
  tools/virtio/ringtest: fix run-on-all.sh for offline cpus
  selftest/powerpc: Wrong PMC initialized in pmc56_overflow test
  soc: ti: wkup_m3_ipc: Fix error return code in wkup_m3_ipc_probe()
  spi: pxa2xx: add missed break
  dmaengine: pl330: Fix runtime PM support for terminated transfers
  dmaengine: rcar-dmac: unmap slave resource when channel is freed
  s5p-mfc: Fix clock management in s5p_mfc_release() function
  s5p-cec: mark PM functions as __maybe_unused again
  st-hva: fix some error handling in hva_hw_probe()
  ite-cir: initialize use_demodulator before using it
  gs1662: drop kfree for memory allocated with devm_kzalloc
  platform: pxa_camera: add VIDEO_V4L2 dependency
  blackfin: check devm_pinctrl_get() for errors
  rpmsg: virtio_rpmsg_bus: fix channel creation
  mtd: spi-nor: Fix some error codes in cqspi_setup_flash()
  mtd: spi-nor: Off by one in cqspi_setup_flash()
  PM / devfreq: Fix the bug of devfreq_add_device when governor is NULL
  PM / devfreq: exynos-bus: Fix the wrong return value
  scsi: mpt3sas: fix hang on ata passthrough commands
  scsi: ses: Fix SAS device detection in enclosure
  swiotlb: Add swiotlb=noforce debug option
  swiotlb: Convert swiotlb_force from int to enum
  arm64: Fix swiotlb fallback allocation
  arm64: mm: avoid name clash in __page_to_voff()
  xprtrdma: Squelch "max send, max recv" messages at connect time
  xprtrdma: Make FRWR send queue entry accounting more accurate
  libceph: make sure ceph_aes_crypt() IV is aligned
  ceph: fix endianness bug in frag_tree_split_cmp
  ceph: fix endianness of getattr mask in ceph_d_revalidate
  ceph: fix ceph_get_caps() interruption
  ceph: fix scheduler warning due to nested blocking
  ARM: 8613/1: Fix the uaccess crash on PB11MPCore
  ARM: ux500: fix prcmu_is_cpu_in_wfi() calculation
  ARM: dts: omap3: Fix Card Detect and Write Protect on Logic PD SOM-LV
  ARM: dts: imx6qdl-nitrogen6_max: fix sgtl5000 pinctrl init
  ARM: dts: omap2: Add an empty chosen node to top level DTSI
  ARM: dts: omap3: Add an empty chosen node to top level DTSI
  ARM: dts: am4372: Add an empty chosen node to top level DTSI
  ARM: dts: omap5: Add an empty chosen node to top level DTSI
  ARM: dts: omap4: Add an empty chosen node to top level DTSI
  ARM: dts: am33xx: Add an empty chosen node to top level DTSI
  ARM: dts: dm814x: Add an empty chosen node to top level DTSI
  ARM: dts: dm816x: Add an empty chosen node to top level DTSI
  ARM: dts: dra7: Add an empty chosen node to top level DTSI
  libceph: remove now unused ceph_*{en,de}crypt*() functions
  libceph: switch ceph_x_decrypt() to ceph_crypt()
  libceph: switch ceph_x_encrypt() to ceph_crypt()
  libceph: tweak calcu_signature() a little
  libceph: rename and align ceph_x_authorizer::reply_buf
  libceph: introduce ceph_crypt() for in-place en/decryption
  libceph: introduce ceph_x_encrypt_offset()
  libceph: old_key in process_one_ticket() is redundant
  libceph: ceph_x_encrypt_buflen() takes in_len
  Input: ALPS - fix TrackStick support for SS5 hardware
  arm64/ptrace: Reject attempts to set incomplete hardware breakpoint fields
  arm64/ptrace: Avoid uninitialised struct padding in fpr_set()
  arm64/ptrace: Preserve previous registers for short regset write - 3
  arm64/ptrace: Preserve previous registers for short regset write - 2
  arm64/ptrace: Preserve previous registers for short regset write
  arm64: avoid returning from bad_mode
  ARM: dts: da850-evm: fix read access to SPI flash
  ARM: dts: OMAP5 / DRA7: indicate that SATA port 0 is available.
  ceph: fix bad endianness handling in parse_reply_info_extra
  ibmvscsis: Fix max transfer length
  ibmvscsis: Fix sleeping in interrupt context
  ARM: 8634/1: hw_breakpoint: blacklist Scorpion CPUs
  svcrdma: avoid duplicate dma unmapping during error recovery
  clocksource/exynos_mct: Clear interrupt when cpu is shut down
  ubifs: Fix journal replay wrt. xattr nodes
  mac80211: implement multicast forwarding on fast-RX path
  qla2xxx: Fix crash due to null pointer access
  x86/ioapic: Restore IO-APIC irq_chip retrigger callback
  powerpc: Ignore reserved field in DCSR and PVR reads and writes
  powerpc/ptrace: Preserve previous TM fprs/vsrs on short regset write
  powerpc/ptrace: Preserve previous fprs/vsrs on short regset write
  powerpc/perf: Fix PM_BRU_CMPL event code for power9
  powerpc/icp-opal: Fix missing KVM case and harden replay
  KVM: arm/arm64: vgic: Fix deadlock on error handling
  KVM: s390: do not expose random data via facility bitmap
  mtd: nand: xway: fix build because of module functions
  mtd: nand: xway: disable module support
  mtd: nand: lpc32xx: fix invalid error handling of a requested irq
  ieee802154: atusb: do not use the stack for buffers to make them DMA able
  mmc: mxs-mmc: Fix additional cycles after transmission stop
  mmc: sdhci-acpi: Only powered up enabled acpi child devices
  HID: corsair: fix control-transfer error handling
  HID: corsair: fix DMA buffers on stack
  PCI: Enumerate switches below PCI-to-PCIe bridges
  PCI: designware: Check for iATU unroll only on platforms that use ATU
  fuse: fix time_to_jiffies nsec sanity check
  fuse: clear FR_PENDING flag when moving requests out of pending queue
  ARC: module: Fix !CONFIG_ARC_DW2_UNWIND builds
  libnvdimm, namespace: fix pmem namespace leak, delete when size set to zero
  svcrpc: don't leak contexts on PROC_DESTROY
  sunrpc: don't call sleeping functions from the notifier block callbacks
  rcu: Narrow early boot window of illegal synchronous grace periods
  rcu: Remove cond_resched() from Tiny synchronize_sched()
  x86/PCI: Ignore _CRS on Supermicro X8DTH-i/6/iF/6F
  tmpfs: clear S_ISGID when setting posix ACLs
  ARM: dts: omap3: Add DTS for Logic PD SOM-LV 37xx Dev Kit
  ARM: dts: imx31: fix AVIC base address
  ARM: dts: imx31: move CCM device node to AIPS2 bus devices
  ARM: dts: imx31: fix clock control module interrupts description
  ARM: dts: imx6q-cm-fx6: fix fec pinctrl
  ARM: dts: r8a7794: remove Z clock
  ARM: dts: r8a7794: Use SYSC "always-on" PM Domain for sound
  ARM: dts: bcm283x: fix typo in mailbox address
  perf jit: Enable jitdump support without dwarf
  perf scripting: Avoid leaking the scripting_context variable
  perf callchain: Fixup help/config for no-unwinding
  perf diff: Do not overwrite valid build id
  perf trace: Check if MAP_32BIT is defined (again)
  perf mem: Fix --all-user/--all-kernel options
  perf trace: Use the syscall raw_syscalls:sys_enter timestamp
  IB/IPoIB: Remove can't use GFP_NOIO warning
  IB/mlx4: Check if GRH is available before using it
  IB/mlx4: When no DMFS for IPoIB, don't allow NET_IF QPs
  IB/mlx4: Fix port query for 56Gb Ethernet links
  IB/mlx4: Handle well-known-gid in mad_demux processing
  IB/mlx4: Fix out-of-range array index in destroy qp flow
  IB/mlx4: Set traffic class in AH
  IB/mlx5: Wait for all async command completions to complete
  IB/mlx5: Assign SRQ type earlier
  IB/mlx5: Fix reported max SGE calculation
  IB/mlx5: Avoid system crash when enabling many VFs
  IB/rxe: avoid putting a large struct rxe_qp on stack
  IB/rxe: Increase max number of completions to 32k
  IB/core: Release allocated memory in cache setup failure
  Linux 4.9.5
  ANDROID: usb: gadget: f_audio_source: Fixed USB Audio Class Interface Descriptor
  ANDROID: usb: gadget: f_audio_source: change max ISO packet size
  ANDROID: usb: gadget: f_accessory: Enabled Zero Length Packet (ZLP) for acc_write
  ANDROID: drivers: usb: gadget: 64-bit related type fixes
  ANDROID: usb: gadget: f_accessory: move userspace interface to uapi
  ANDROID: usb: gadget: f_mtp: move userspace interface to uapi
  ANDROID: USB: remove duplicate out endpoint creation in MTP mode
  ANDROID: usb: gadget: Fix android gadget driver build
  ANDROID: usb: gadget: Fixes and hacks to make android usb gadget compile on 3.8
  ANDROID: usb: otg: otg-wakelock: Fix build for 3.7
  ANDROID: usb: gadget: accessory: Fix section mismatch (again)
  ANDROID: USB: gadget: f_audio_source: New gadget driver for audio output
  ANDROID: USB: gadget: f_accessory: Add support for HID input devices
  ANDROID: USB: gadget: Add ACCESSORY_SET_AUDIO_MODE control request and ioctl
  ANDROID: usb: gadget: accessory: Fix section mismatch
  ANDROID: usb: otg: otg-wakelock: Fix build for 3.4
  ANDROID: usb: gadget: adb: Only enable the gadget when adbd is ready
  ANDROID: usb: gadget: adb: do not set error flag when dequeuing req
  ANDROID: usb: gadget: adb: allow freezing in adb_read
  ANDROID: usb: gadget: accessory: Add Android Accessory function
  ANDROID: usb: gadget: adb: Add ADB function
  ANDROID: usb: gadget: mtp: Add MTP/PTP function
  ANDROID: usb: otg: otg-wakelock: fix build for 3.3
  ANDROID: usb: otg: Temporarily grab wakelock on charger and disconnect events
  ANDROID: USB: OTG: Take wakelock when VBUS present
  ANDROID: wlan: Add get_wake_irq functionality
  ANDROID: Add flags parameter to get_country_code template
  ANDROID: net: wireless: Add get_country_code functionality to platform
  ANDROID: network: wireless: Add get_mac_addr functionality to platform
  ANDROID: wlan: Create generic wlan platform data header
  ANDROID: net: wireless: Decrease scan entry expiration to avoid stall results
  ANDROID: bridge: Have tx_bytes count headers like rx_bytes.
  ANDROID: rfkill: Introduce CONFIG_RFKILL_PM and use instead of CONFIG_PM to power down
  ANDROID: net: ipv6: fix virtual tunneling build
  ANDROID: tcp: fix tcp_default_init_rwnd() for 4.1
  ANDROID: net: support marking accepting TCP sockets
  ANDROID: tcp: add a sysctl to config the tcp_default_init_rwnd
  ANDROID: xt_qtaguid: fix a race condition in if_tag_stat_update
  ANDROID: netfilter: xt_qtaguid/socket: build fixes for 4.4
  ANDROID: netfilter: xt_qtaguid: xt_socket: build fixes
  ANDROID: net: xt_qtaguid/xt_socket: fix refcount underflow and crash
  ANDROID: xt_qtaguid: use sock_gen_put() instead of xt_socket_put_sk()
  ANDROID: xt_qtaguid: Use sk_callback_lock read locks before reading sk->sk_socket
  ANDROID: xt_qtaguid: fix broken uid/gid range check
  ANDROID: netfilter: Build fixups - kuid/kguid changes & xt_socket_get/put_sk
  ANDROID: net: ipv6: autoconf routes into per-device tables
  ANDROID: nf: IDLETIMER: Fix broken uid field in the msg
  ANDROID: nf: IDLETIMER: Adds the uid field in the msg
  ANDROID: netfilter: fix seq_printf type mismatch warning
  ANDROID: nf: Remove compilation error caused by e8430cbed3ef15fdb1ac26cfd020e010aa5f1c35
  ANDROID: nf: IDLETIMER: time-stamp and suspend/resume handling.
  ANDROID: xt_qtaguid: Fix boot panic
  ANDROID: net: kuid/kguid build fixes
  ANDROID: netfilter: ipv6: fix crash caused by ipv6_find_hdr()
  ANDROID: netfilter: xt_qtaguid: 64-bit warning fixes
  ANDROID: netfilter: xt_qtaguid: fix memory leak in seq_file handlers
  ANDROID: netfilter: xt_qtaguid: fix bad tcp_time_wait sock handling
  ANDROID: netfilter: xt_qtaguid: 3.10 fixes
  ANDROID: netfilter: xt_quota2: 3.10 fixes.
  ANDROID: netfilter: qtaguid: rate limit some of the printks
  ANDROID: netfilter: xt_qtaguid: Allow tracking loopback
  ANDROID: netfilter: xt_qtaguid: extend iface stat to report protocols
  ANDROID: netfilter: xt_qtaguid: remove AID_* dependency for access control
  ANDROID: netfilter: qtaguid: Don't BUG_ON if create_if_tag_stat fails
  ANDROID: netfilter: xt_qtaguid: fix error exit that would keep a spinlock.
  ANDROID: netfilter: xt_qtaguid: report only uid tags to non-privileged processes
  ANDROID: netfilter: xt_qtaguid: start tracking iface rx/tx at low level
  ANDROID: netfilter: xt_IDLETIMER: Add new netlink msg type
  ANDROID: netfilter: xt_qtaguid: fix ipv6 protocol lookup
  ANDROID: netfilter: qtaguid: initialize a local var to keep compiler happy.
  ANDROID: netfilter: fixup the quota2, and enable.
  ANDROID: netfilter: adding the original quota2 from xtables-addons
  ANDROID: netfilter: add xt_qtaguid matching module
  ANDROID: net: PPPoPNS and PPPoLAC build fixes for 4.4
  ANDROID: Hack: net: PPPoPNS and PPPoLAC build fixes for 4.1
  ANDROID: net: pppopns: pppolac: fix sendmsg function calls
  ANDROID: net: PPPoPNS: Remove length argument from data_ready
  ANDROID: net: move PPPoLAC and PPPoPNS headers to uapi
  ANDROID: Include if_pppolac.h and if_pppopns.h into header-y target
  ANDROID: net: PPPoPNS and PPPoLAC update to use PPP_MRU instead of PPP_MRU
  ANDROID: net: Reorder incoming packets in PPPoLAC and PPPoPNS.
  ANDROID: net: PPPoPNS and PPPoLAC fixes.
  ANDROID: net: add PPP on PPTP Network Server (PPPoPNS) driver.
  ANDROID: net: add PPP on L2TP Access Concentrator (PPPoLAC) driver.
  ANDROID: sysfs_net_ipv4: Add sysfs-based knobs for controlling TCP window size
  ANDROID: net: Only NET_ADMIN is allowed to fully control TUN interfaces.
  ANDROID: net: Replace AID_NET_RAW checks with capable(CAP_NET_RAW).
  ANDROID: security: Add AID_NET_RAW and AID_NET_ADMIN capability check in cap_capable().
  ANDROID: Paranoid network.
  ANDROID: Add android_aid.h
  ANDROID: fs: block_dump: Don't display inode changes if block_dump < 2
  ANDROID: mmc: core: Remove stray CONFIG_EXPERIMENTAL dependencies
  ANDROID: mmc: Add "ignore mmc pm notify" functionality
  ANDROID: mmc: sdio: Fix sdio_reset_comm for sync
  ANDROID: mmc: sdio: fix sdio_reset_comm() voltage selection
  ANDROID: mmc: sdio: Fix enable_hs and enable_wide in sdio_reset_comm()
  ANDROID: mmc: sdio: Add high speed support to sdio_reset_comm()
  ANDROID: mmc: sdio: Claim host in sdio_reset_comm()
  ANDROID: mmc: Add new API call 'sdio_reset_comm' for resetting communication with an SDIO device
  ANDROID: mmc: Add sdio_readb_ext() function
  ANDROID: mmc: Add concept of an 'embedded' SDIO device.
  ANDROID: mmc: sd: Add retries in re-detection
  ANDROID: mmc: sd: When resuming, try a little harder to init the card
  ANDROID: mmc: sd: Add new CONFIG_MMC_PARANOID_SD_INIT for enabling retries during SD detection
  ANDROID: mtd: nand: Allow NAND chip ids to be included standalone.
  ANDROID: gpio_input: convert from wakelocks to wakeup sources
  ANDROID: input: Made keyreset more robust
  ANDROID: input: Changed keyreset to act as a wrapper for keycombo.
  ANDROID: input: add keycombo, a general key combo driver.
  ANDROID: input: misc: keychord: move header to uapi
  ANDROID: input: misc: keychord: log when keychord triggered
  ANDROID: input: keychord: Add keychord driver
  ANDROID: input: Add keyreset driver.
  ANDROID: input: misc: gpio_event: remove early suspend
  ANDROID: Input: Generic GPIO Input device.
  ANDROID: lowmemorykiller: use module_param_cb instead of __module_param_call
  ANDROID: lowmemorykiller: trace kill events.
  ANDROID: staging: lowmemorykiller: Add config option to support oom_adj values
  pinctrl: sh-pfc: Do not unconditionally support PIN_CONFIG_BIAS_DISABLE
  arm64: hugetlb: fix the wrong return value for huge_ptep_set_access_flags
  arm64: hugetlb: remove the wrong pmd check in find_num_contig()
  arm64: hugetlb: fix the wrong address for several functions
  powerpc/powernv: Don't warn on PE init if unfreeze is unsupported
  powerpc/ibmebus: Fix device reference leaks in sysfs interface
  powerpc/ibmebus: Fix further device reference leaks
  powerpc/mm: Correct process and partition table max size
  bus: vexpress-config: fix device reference leak
  blk-mq: Always schedule hctx->next_cpu
  power: supply: bq27xxx_battery: Fix register map for BQ27510 and BQ27520
  bq24190_charger: Fix PM runtime use for bq24190_battery_set_property
  iw_cxgb4: Fix error return code in c4iw_rdev_open()
  powercap/intel_rapl: fix and tidy up error handling
  ACPI / APEI: Fix NMI notification handling
  block: cfq_cpd_alloc() should use @gfp
  block: Change extern inline to static inline
  ACPI / CPPC: set an error code on probe error path
  regulators: helpers: Fix handling of bypass_val_on in get_bypass_regmap
  cpufreq: powernv: Disable preemption while checking CPU throttling state
  powerpc/64: Simplify adaptation to new ISA v3.00 HPTE format
  remoteproc: st: Fix error return code in st_rproc_probe()
  remoteproc: qcom_wcnss: Fix circular module dependency
  drm: Initialise drm_mm.head_node.allocated
  drm/i915: Move the min_pixclk[] handling to the end of readout
  drm/panel: simple: Check against num_timings when setting preferred for timing
  drm: avoid uninitialized timestamp use in wait_vblank
  drm/i915/gen9: Fix PCODE polling during SAGV disabling
  i2c: mux: pca954x: fix i2c mux selection caching
  NFSv4.1: nfs4_fl_prepare_ds must be careful about reporting success.
  NFS: Fix a performance regression in readdir
  pNFS: Fix race in pnfs_wait_on_layoutreturn
  NFS: fix typo in parameter description
  pinctrl: meson: fix gpio request disabling other modes
  btrfs: fix error handling when run_delayed_extent_op fails
  btrfs: fix locking when we put back a delayed ref that's too new
  nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time too
  x86/cpu: Fix bootup crashes by sanitizing the argument of the 'clearcpuid=' command-line option
  i2c: piix4: Avoid race conditions with IMC
  net/mlx5: Only cancel recovery work when cleaning up device
  USB: serial: ch341: fix modem-control and B0 handling
  drm/amdgpu: drop verde dpm quirks
  drm/amdgpu: update si kicker smc firmware
  drm/radeon: drop verde dpm quirks
  drm/radeon: update smc firmware selection for SI
  drm: Clean up planes in atomic commit helper failure path
  drm/i915/gen9: Fix PCODE polling timeout in stable backport
  net/af_iucv: don't use paged skbs for TX on HiperSockets
  sysctl: Drop reference added by grab_header in proc_sys_readdir
  Clearing FIFOs in RS485 emulation mode causes subsequent transmits to break
  extcon: return error code on failure
  sysrq: attach sysrq handler correctly for 32-bit kernel
  orinoco: Use shash instead of ahash for MIC calculations
  ibmvscsis: Fix srp_transfer_data fail return code
  tty/serial: atmel_serial: BUG: stop DMA from transmitting in stop_tx
  tty/serial: atmel: RS485 half duplex w/DMA: enable RX after TX is done
  virtio_blk: avoid DMA to stack for the sense buffer
  dmaengine: omap-dma: Fix dynamic lch_map allocation
  drivers: char: mem: Fix thinkos in kmem address checks
  mnt: Protect the mountpoint hashtable with mount_lock
  pid: fix lockdep deadlock warning due to ucount_lock
  vme: Fix wrong pointer utilization in ca91cx42_slave_get
  Revert "tty: serial: 8250: add CON_CONSDEV to flags"
  ASoC: hdmi-codec: use unsigned type to structure members with bit-field
  btrfs: fix crash when tracepoint arguments are freed by wq callbacks
  xhci: fix deadlock at host remove by running watchdog correctly
  fix a fencepost error in pipe_advance()
  i2c: fix kernel memory disclosure in dev interface
  i2c: print correct device invalid address
  Input: elants_i2c - avoid divide by 0 errors on bad touchscreen data
  USB: serial: ch341: fix open and resume after B0
  USB: serial: ch341: fix control-message error handling
  USB: serial: ch341: fix open error handling
  USB: serial: ch341: fix resume after reset
  USB: serial: ch341: fix initial modem-control state
  USB: serial: kl5kusb105: fix line-state error handling
  usb: musb: fix runtime PM in debugfs
  wusbcore: Fix one more crypto-on-the-stack bug
  x86/CPU/AMD: Fix Bulldozer topology
  x86/bugs: Separate AMD E400 erratum and C1E bug
  x86/cpu/AMD: Clean up cpu_llc_id assignment per topology feature
  bridge: netfilter: Fix dropping packets that moving through bridge interface
  xfs: Timely free truncated dirty pages
  gpio: Move freeing of GPIO hogs before numbing of the device
  nl80211: fix sched scan netlink socket owner destruction
  x86/efi: Don't allocate memmap through memblock after mm_init()
  efi/x86: Prune invalid memory map entries and fix boot regression
  efi/libstub/arm*: Pass latest memory map to the kernel
  KVM: x86: Introduce segmented_write_std
  KVM: x86: emulate FXSAVE and FXRSTOR
  KVM: x86: add asm_safe wrapper
  KVM: x86: add Align16 instruction flag
  KVM: x86: fix NULL deref in vcpu_scan_ioapic
  KVM: x86: flush pending lapic jump label updates on module unload
  jump_labels: API for flushing deferred jump label updates
  KVM: eventfd: fix NULL deref irqbypass consumer
  KVM: x86: fix emulation of "MOV SS, null selector"
  mm/hugetlb.c: fix reservation race when freeing surplus pages
  mm/slab.c: fix SLAB freelist randomization duplicate entries
  mm: support anonymous stable page
  mm, memcg: fix the active list aging for lowmem requests when memcg is enabled
  ocfs2: fix crash caused by stale lvb with fsdlm plugin
  mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
  mm: pmd dirty emulation in page fault handler
  dax: fix deadlock with DAX 4k holes
  zram: support BDI_CAP_STABLE_WRITES
  zram: revalidate disk under init_lock
  selftests: do not require bash for the generated test
  selftests: do not require bash to run netsocktests testcase
  drm/savage: dereferencing an error pointer
  drm/vc4: Fix a couple error codes in vc4_cl_lookup_bos()
  drm/tegra: dpaux: Fix error handling
  regulator: axp20x: Fix axp809 ldo_io registration error on cold boot
  regulator: tps65086: Fix 25mV ranges for BUCK regulators
  pinctrl: sh-pfc: Add helper to handle bias lookup table
  pinctrl: sh-pfc: r8a7795: Use lookup function for bias data
  pinctrl: imx: fix imx_pinctrl_desc initialization
  Input: i8042 - add Pegatron touchpad to noloop table
  Input: xpad - use correct product id for x360w controllers
  Linux 4.9.4
  rtlwifi: rtl_usb: Fix missing entry in USB driver's private data
  rtlwifi: Fix enter/exit power_save
  drm/i915/gen9: Fix PCODE polling during CDCLK change notification
  ALSA: usb-audio: Add a quirk for Plantronics BT600
  spi: mvebu: fix baudrate calculation for armada variant
  ARM: omap2+: am437x: rollback to use omap3_gptimer_timer_init()
  ARM: 8631/1: clkdev: Detect errors in clk_hw_register_clkdev() for mass registration
  ARM: OMAP4+: Fix bad fallthrough for cpuidle
  ARM: OMAP5: Fix build for PM code
  ARM: OMAP5: Fix mpuss_early_init
  bus: arm-ccn: Prevent hotplug callback leak
  svcrdma: Clear xpt_bc_xps in xprt_setup_rdma_bc() error exit arm
  ARM: qcom_defconfig: Fix MDM9515 LCC and GCC config
  ARM: zynq: Reserve correct amount of non-DMA RAM
  ARM: pxa: fix pxa25x interrupt init
  ARM64: dts: bcm2835: Fix bcm2837 compatible string
  ARM64: dts: bcm2837-rpi-3-b: remove incorrect pwr LED
  arm64: dts: mt8173: Fix auxadc node
  tools/virtio: fix READ_ONCE()
  powerpc: Fix build warning on 32-bit PPC
  ALSA: firewire-tascam: Fix to handle error from initialization of stream data
  HID: hid-cypress: validate length of report
  net: vrf: do not allow table id 0
  net: ipv4: Fix multipath selection with vrf
  net/mlx5e: Remove WARN_ONCE from adaptive moderation code
  gro: Disable frag0 optimization on IPv6 ext headers
  gro: use min_t() in skb_gro_reset_offset()
  gro: Enter slow-path if there is no tailroom
  net: add the AF_QIPCRTR entries to family name tables
  net: dsa: Ensure validity of dst->ds[0]
  r8152: fix rx issue for runtime suspend
  r8152: split rtl8152_suspend function
  net: dsa: bcm_sf2: Utilize nested MDIO read/write
  net: dsa: bcm_sf2: Do not clobber b53_switch_ops
  bpf: change back to orig prog on too many passes
  net: vrf: Add missing Rx counters
  ipv4: Do not allow MAIN to be alias for new LOCAL w/ custom rules
  igmp: Make igmp group member RFC 3376 compliant
  flow_dissector: Update pptp handling to avoid null pointer deref.
  drop_monitor: consider inserted data in genlmsg_end
  drop_monitor: add missing call to genlmsg_end
  net: ipv4: dst for local input routes should use l3mdev if relevant
  net: fix incorrect original ingress device index in PKTINFO
  rtnl: stats - add missing netlink message size checks
  net/mlx5e: Disable netdev after close
  net/mlx5e: Don't sync netdev state when not registered
  net/mlx5: Prevent setting multicast macs for VFs
  net/mlx5: Mask destination mac value in ethtool steering rules
  net/mlx5: Avoid shadowing numa_node
  net/mlx5: Cancel recovery work in remove flow
  net/mlx5: Check FW limitations on log_max_qp before setting it
  net/sched: cls_flower: Fix missing addr_type in classify
  net: stmmac: Fix race between stmmac_drv_probe and stmmac_open
  net, sched: fix soft lockup in tc_classify
  ipv6: handle -EFAULT from skb_copy_bits
  inet: fix IP(V6)_RECVORIGDSTADDR for udp sockets
  sctp: sctp_transport_lookup_process should rcu_read_unlock when transport is null
  net: vrf: Drop conntrack data after pass through VRF device on Tx
  net: vrf: Fix NAT within a VRF
  Linux 4.9.3
  usb: gadget: composite: always set ep->mult to a sensible value
  Revert "usb: gadget: composite: always set ep->mult to a sensible value"
  Revert "rtlwifi: Fix enter/exit power_save"
  tick/broadcast: Prevent NULL pointer dereference
  clocksource/dummy_timer: Move hotplug callback after the real timers
  xfs: fix max_retries _show and _store functions
  xfs: fix crash and data corruption due to removal of busy COW extents
  xfs: use the actual AG length when reserving blocks
  xfs: fix double-cleanup when CUI recovery fails
  xfs: use GPF_NOFS when allocating btree cursors
  xfs: ignore leaf attr ichdr.count in verifier during log replay
  xfs: don't cap maximum dedupe request length
  xfs: don't allow di_size with high bit set
  xfs: error out if trying to add attrs and anextents > 0
  xfs: don't crash if reading a directory results in an unexpected hole
  xfs: complain if we don't get nextents bmap records
  xfs: check for bogus values in btree block headers
  xfs: forbid AG btrees with level == 0
  xfs: handle cow fork in xfs_bmap_trace_exlist
  xfs: pass state not whichfork to trace_xfs_extlist
  xfs: Move AGI buffer type setting to xfs_read_agi
  xfs: pass post-eof speculative prealloc blocks to bmapi
  xfs: use new extent lookup helpers xfs_file_iomap_begin_delay
  xfs: clean up cow fork reservation and tag inodes correctly
  xfs: use new extent lookup helpers in __xfs_reflink_reserve_cow
  xfs: track preallocation separately in xfs_bmapi_reserve_delalloc()
  xfs: remove prev argument to xfs_bmapi_reserve_delalloc
  xfs: always succeed when deduping zero bytes
  xfs: factor rmap btree size into the indlen calculations
  xfs: new inode extent list lookup helpers
  xfs: fix unbalanced inode reclaim flush locking
  xfs: check minimum block size for CRC filesystems
  xfs: provide helper for counting extents from if_bytes
  xfs: don't BUG() on mixed direct and mapped I/O
  xfs: don't skip cow forks w/ delalloc blocks in cowblocks scan
  xfs: check return value of _trans_reserve_quota_nblks
  xfs: don't call xfs_sb_quota_from_disk twice
  tpm_tis: Check return values from get_burstcount.
  drm/i915/gen9: fix the WM memory bandwidth WA for Y tiling cases
  drm/i915/gen9: unconditionally apply the memory bandwidth WA
  drm/i915: disable PSR by default on HSW/BDW
  drm/radeon: Always store CRTC relative radeon_crtc->cursor_x/y values
  s390/pci: fix dma address calculation in map_sg
  s390/topology: always use s390 specific sched_domain_topology_level
  powerpc/pci/rpadlpar: Fix device reference leaks
  PCI: Enable access to non-standard VPD for Chelsio devices (cxgb3)
  PCI: Support INTx masking on ConnectX-4 with firmware x.14.1100+
  PCI: Convert Mellanox broken INTx quirks to be for listed devices only
  PCI: Convert broken INTx masking quirks from HEADER to FINAL
  PCI: Add Mellanox device IDs
  PCI: rockchip: Correct the use of FTS mask
  PCI: rockchip: Fix negotiated lanes calculation
  staging: media: davinci_vpfe: unlock on error in vpfe_reqbufs()
  f2fs: hide a maybe-uninitialized warning
  f2fs: remove percpu_count due to performance regression
  md: fix refcount problem on mddev when stopping array.
  md: MD_RECOVERY_NEEDED is set for mddev->recovery
  crypto: arm64/aes-ce - fix for big endian
  crypto: arm64/aes-xts-ce: fix for big endian
  crypto: arm64/sha1-ce - fix for big endian
  crypto: arm64/aes-neon - fix for big endian
  crypto: arm64/aes-ccm-ce: fix for big endian
  crypto: arm/aes-ce - fix for big endian
  crypto: arm64/ghash-ce - fix for big endian
  crypto: arm64/sha2-ce - fix for big endian
  s390/crypto: unlock on error in prng_tdes_read()
  mm, compaction: fix NR_ISOLATED_* stats for pfn based migration
  mm: khugepaged: fix radix tree node leak in shmem collapse error path
  mm: khugepaged: close use-after-free race during shmem collapsing
  docs-rst: fix LaTeX \DURole renewcommand with Sphinx 1.3+
  mm/hugetlb.c: use the right pte val for compare in hugetlb_cow
  rpmsg: qcom_smd: Correct return value for O_NONBLOCK
  mmc: mmc_test: Uninitialized return value
  genirq/affinity: Fix node generation from cpumask
  PM / wakeirq: Fix dedicated wakeirq for drivers not using autosuspend
  irqchip/bcm7038-l1: Implement irq_cpu_offline() callback
  PCI/MSI: Check for NULL affinity mask in pci_irq_get_affinity()
  ima: fix memory leak in ima_release_policy
  relay: check array offset before using it
  sbp-target: Fix second argument of percpu_ida_alloc()
  target/iscsi: Fix double free in lio_target_tiqn_addtpg()
  scsi: mvsas: fix command_active typo
  scsi: g_NCR5380: Fix release_region in error handling
  ASoC: samsung: i2s: Fixup last IRQ unsafe spin lock call
  ASoC: Intel: Skylake: Fix a shift wrapping bug
  ASoC: cht_bsw_rt5645: Fix leftover kmalloc
  ASoC: lpass-platform: initialize dma channel number
  iommu/vt-d: Flush old iommu caches for kdump when the device gets context mapped
  iommu/vt-d: Fix pasid table size encoding
  iommu/amd: Fix the left value check of cmd buffer
  iommu/amd: Missing error code in amd_iommu_init_device()
  clk: renesas: mstp: Support 8-bit registers for r7s72100
  clk: imx31: fix rewritten input argument of mx31_clocks_init()
  clk: sunxi-ng: sun8i-h3: Set CLK_SET_RATE_PARENT for audio module clocks
  clk: sunxi-ng: sun8i-a23: Set CLK_SET_RATE_PARENT for audio module clocks
  clk: ti: dra7: fix "failed to lookup clock node gmac_gmii_ref_clk_div" boot message
  clk: clk-wm831x: fix a logic error
  clk: qcom: ipq806x: Fix board clk rates
  Input: synaptics-rmi4 - unlock on error
  hwmon: (lm90) fix temp1_max_alarm attribute
  hwmon: (g762) Fix overflows and crash seen when writing limit attributes
  hwmon: (nct7802) Fix overflows seen when writing into limit attributes
  hwmon: (ds620) Fix overflows seen when writing temperature limits
  hwmon: (amc6821) sign extension temperature
  hwmon: (scpi) Fix module autoload
  platform/x86: fujitsu-laptop: use brightness_set_blocking for LED-setting callbacks
  x86/cpu: Probe CPUID leaf 6 even when cpuid_level == 6
  x86/prctl/uapi: Remove #ifdef for CHECKPOINT_RESTORE
  debugfs: improve DEFINE_DEBUGFS_ATTRIBUTE for !CONFIG_DEBUG_FS
  clk: renesas: cpg-mssr: Fix inverted debug check
  efi/efivar_ssdt_load: Don't return success on allocation failure
  cris: Only build flash rescue image if CONFIG_ETRAX_AXISFLASHMAP is selected
  ath10k: use the right length of "background"
  mfd: tps65217: Fix page fault on unloading modules
  ath10k: fix failure to send NULL func frame for 10.4
  nl80211: Use different attrs for BSSID and random MAC addr in scan req
  mac80211: fix tid_agg_rx NULL dereference
  drm/i915: tune down the fast link training vs boot fail
  drm/i915/dp: add lane_count check in intel_dp_check_link_status
  usb: dwc3: gadget: always unmap EP0 requests
  usb: dwc3: ep0: explicitly call dwc3_ep0_prepare_one_trb()
  usb: dwc3: ep0: add dwc3_ep0_prepare_one_trb()
  iio: accel: st_accel: fix LIS3LV02 reading and scaling
  staging: iio: ad7606: fix improper setting of oversampling pins
  mei: move write cb to completion on credentials failures
  mei: bus: fix mei_cldev_enable KDoc
  mei: fix parameter rename KDoc
  USB: serial: io_ti: bind to interface after fw download
  dibusb: fix possible memory leak in dibusb_rc_query()
  ARM: dts: sun7i: bananapi-m1-plus: Enable USB PHY for USB host support
  arm64: dts: hip06: Correct hardware pin number of usb node
  USB: phy: am335x-control: fix device and of_node leaks
  ARM: dts: r8a7794: Correct hsusb parent clock
  usb: gadget: fix request length error for isoc transfer
  usb: gadget: Fix second argument of percpu_ida_alloc()
  USB: serial: kl5kusb105: abort on open exception path
  ALSA: usb-audio: Fix bogus error return in snd_usb_create_stream()
  usb: musb: blackfin: add bfin_fifo_offset in bfin_ops
  usb: gadget: udc: core: fix return code of usb_gadget_probe_driver()
  usb: hub: Move hub_port_disable() to fix warning if PM is disabled
  usb: musb: Fix trying to free already-free IRQ 4
  usb: dwc3: gadget: Fix full speed mode
  usb: dwc3: pci: Fix dr_mode misspelling
  usb: dwc3: pci: add Intel Gemini Lake PCI ID
  xhci: Fix race related to abort operation
  xhci: Use delayed_work instead of timer for command timeout
  usb: xhci-mem: use passed in GFP flags instead of GFP_KERNEL
  USB: serial: mos7720: fix parallel probe
  USB: serial: mos7720: fix parport use-after-free on probe errors
  USB: serial: mos7720: fix use-after-free on probe errors
  USB: serial: mos7720: fix NULL-deref at open
  USB: serial: mos7840: fix NULL-deref at open
  USB: serial: kobil_sct: fix NULL-deref in write
  USB: serial: cyberjack: fix NULL-deref at open
  USB: serial: oti6858: fix NULL-deref at open
  USB: serial: io_edgeport: fix NULL-deref at open
  USB: serial: ti_usb_3410_5052: fix NULL-deref at open
  USB: serial: garmin_gps: fix memory leak on failed URB submit
  USB: serial: iuu_phoenix: fix NULL-deref at open
  USB: serial: io_ti: fix I/O after disconnect
  USB: serial: io_ti: fix another NULL-deref at open
  USB: serial: io_ti: fix NULL-deref at open
  USB: serial: spcp8x5: fix NULL-deref at open
  USB: serial: keyspan_pda: verify endpoints at probe
  USB: serial: pl2303: fix NULL-deref at open
  USB: serial: quatech2: fix sleep-while-atomic in close
  USB: serial: omninet: fix NULL-derefs at open and disconnect
  usb: return error code when platform_get_irq fails
  usb: xhci: hold lock over xhci_abort_cmd_ring()
  xhci: Handle command completion and timeout race
  usb: host: xhci: Fix possible wild pointer when handling abort command
  usb: xhci: fix return value of xhci_setup_device()
  xhci: free xhci virtual devices with leaf nodes first
  usb: xhci: apply XHCI_PME_STUCK_QUIRK to Intel Apollo Lake
  usb: xhci: fix possible wild pointer
  usb: dwc3: core: avoid Overflow events
  usb: gadget: composite: Test get_alt() presence instead of set_alt()
  USB: dummy-hcd: fix bug in stop_activity (handle ep0)
  USB: fix problems with duplicate endpoint addresses
  USB: gadgetfs: fix checks of wTotalLength in config descriptors
  USB: gadgetfs: fix use-after-free bug
  USB: gadgetfs: fix unbounded memory allocation bug
  usb: gadgetfs: restrict upper bound on device configuration size
  usb: storage: unusual_uas: Add JMicron JMS56x to unusual device
  usb: musb: dsps: implement clear_ep_rxintr() callback
  usb: musb: core: add clear_ep_rxintr() to musb_platform_ops
  KVM: MIPS: Flush KVM entry code from icache globally
  KVM: MIPS: Don't clobber CP0_Status.UX
  KVM: x86: reset MMU on KVM_SET_VCPU_EVENTS
  drm/i915: Initialize overlay->last_flip properly
  drm/i915: Force VDD off on the new power seqeuencer before starting to use it
  drm/i915: Fix oops in overlay due to frontbuffer tracking
  drm/i915: Fix oopses in the overlay code due to i915_gem_active stuff
  gcc-plugins: update gcc-common.h for gcc-7
  asm-prototypes: Clear any CPP defines before declaring the functions
  mac80211: initialize fast-xmit 'info' later
  pinctrl/amd: Set the level based on ACPI tables
  ARM: davinci: da850: don't add emac clock to lookup table twice
  HID: sensor-hub: Move the memset to sensor_hub_get_feature()
  parisc: Mark cr16 clocksource unstable on SMP systems
  parisc: Add line-break when printing segfault info
  fscrypt: fix renaming and linking special files
  ALSA: usb-audio: Fix irq/process data synchronization
  ALSA: hda - Apply asus-mode8 fixup to ASUS X71SL
  ALSA: hda - Fix up GPIO for ASUS ROG Ranger
  staging: octeon: Call SET_NETDEV_DEV()
  iio: bmi160: Fix time needed to sleep after command execution
  iio: max44000: correct value in illuminance_integration_time_available
  iio: common: st_sensors: fix channel data parsing
  Linux 4.9.2
  drm/i915: Fix setting of boost freq tunable
  drm/i915: skip the first 4k of stolen memory on everything >= gen8
  drm/i915: Initialize dev_priv->atomic_cdclk_freq at init time
  drm/i915: Fix cdclk vs. dev_cdclk mess when not recomputing things
  drm/i915/dsi: Do not clear DPOUNIT_CLOCK_GATE_DISABLE from vlv_init_display_clock_gating
  drm/i915/dsi: Fix chv_exec_gpio disabling the GPIOs it is setting
  net: mvpp2: fix dma unmapping of TX buffers for fragments
  sg_write()/bsg_write() is not fit to be called under KERNEL_DS
  kconfig/nconf: Fix hang when editing symbol with a long prompt
  target/user: Fix use-after-free of tcmu_cmds if they are expired
  libnvdimm, pfn: fix align attribute
  of, numa: Return NUMA_NO_NODE from disable of_node_to_nid() if nid not possible.
  powerpc/boot: Request no dynamic linker for boot wrapper
  powerpc/ps3: Fix system hang with GCC 5 builds
  powerpc/64e: Convert cmpi to cmpwi in head_64.S
  SUNRPC: fix refcounting problems with auth_gss messages.
  pNFS: Fix a deadlock between read resends and layoutreturn
  pNFS: Clear NFS_LAYOUT_RETURN_REQUESTED when invalidating the layout stateid
  pNFS: Don't clear the layout stateid if a layout return is outstanding
  pNFS: On error, do not send LAYOUTGET until the LAYOUTRETURN has completed
  nfs_write_end(): fix handling of short copies
  libceph: verify authorize reply on connect
  PCI: Check for PME in targeted sleep state
  i40iw: Use correct src address in memcpy to rdma stats counters
  bad_inode: add missing i_op initializers
  Input: drv260x - fix input device's parent assignment
  v4l: tvp5150: Add missing break in set control handler
  media: solo6x10: fix lockup by avoiding delayed register write
  s5p-mfc: fix failure path of s5p_mfc_alloc_memdev()
  mn88473: fix chip id check on probe
  mn88472: fix chip id check on probe
  IB/cma: Fix a race condition in iboe_addr_get_sgid()
  IB/rxe: Fix a memory leak in rxe_qp_cleanup()
  IB/multicast: Check ib_find_pkey() return value
  IPoIB: Avoid reading an uninitialized member variable
  IB/mad: Fix an array index check
  fgraph: Handle a case where a tracer ignores set_graph_notrace
  x86/smpboot: Make logical package management more robust
  platform/x86: asus-nb-wmi.c: Add X45U quirk
  ftrace/x86_32: Set ftrace_stub to weak to prevent gcc from using short jumps to it
  vsock/virtio: fix src/dst cid format
  fsnotify: Fix possible use-after-free in inode iteration on umount
  kvm: nVMX: Allow L1 to intercept software exceptions (#BP and #OF)
  KVM: PPC: Book3S HV: Don't lose hardware R/C bit updates in H_PROTECT
  KVM: PPC: Book3S HV: Save/restore XER in checkpointed register state
  scsi: aacraid: remove wildcard for series 9 controllers
  md/raid5: limit request size according to implementation limits
  sc16is7xx: Drop bogus use of IRQF_ONESHOT
  latent_entropy: fix ARM build error on earlier gcc
  arm64: KVM: pmu: Reset PMSELR_EL0.SEL to a sane value before entering the guest
  s390/kexec: use node 0 when re-adding crash kernel memory
  s390/vmlogrdr: fix IUCV buffer allocation
  firmware: fix usermode helper fallback loading
  ARC: mm: arc700: Don't assume 2 colours for aliasing VIPT dcache
  scsi: avoid a permanent stop of the scsi device's request queue
  scsi: zfcp: fix rport unblock race with LUN recovery
  scsi: zfcp: do not trace pure benign residual HBA responses at default level
  scsi: zfcp: fix use-after-"free" in FC ingress path after TMF
  iscsi-target: Return error if unable to add network portal
  scsi: megaraid_sas: Do not set MPI2_TYPE_CUDA for JBOD FP path for FW which does not support JBOD sequence map
  scsi: megaraid_sas: For SRIOV enabled firmware, ensure VF driver waits for 30secs before reset
  stm class: Fix device leak in open error path
  vt: fix Scroll Lock LED trigger name
  block: protect iterate_bdevs() against concurrent close
  mei: me: add lewisburg device ids
  mei: request async autosuspend at the end of enumeration
  drivers/gpu/drm/ast: Fix infinite loop if read fails
  drm/amdgpu: fix init save/restore list in gfx_v8.0
  drm/amdgpu: fix enable_cp_power_gating in gfx_v8.0.
  drm/amd/powerplay: bypass fan table setup if no fan connected
  drm/gma500: Add compat ioctl
  drm/radeon/si: load the proper firmware on 0x87 oland boards
  drm/radeon: add additional pci revision to dpm workaround
  drm/radeon: Hide the HW cursor while it's out of bounds
  drm/radeon: Also call cursor_move_locked when the cursor size changes
  drm/nouveau/fifo/gf100-: protect channel preempt with subdev mutex
  drm/nouveau/i2c/gk110b,gm10x: use the correct implementation
  drm/nouveau/ttm: wait for bo fence to signal before unmapping vmas
  drm/nouveau/ltc: protect clearing of comptags with mutex
  drm/nouveau/bios: require checksum to match for fast acpi shadow method
  drm/nouveau/kms: lvds panel strap moved again on maxwell
  drm/nouveau/gr: fallback to legacy paths during firmware lookup
  drm/amd/amdgpu: enable GUI idle INT after enabling CGCG
  drm/amdgpu: Also call cursor_move_locked when the cursor size changes
  drm/amdgpu: Store CRTC relative amdgpu_crtc->cursor_x/y values
  drm/amdgpu: add additional pci revision to dpm workaround
  drm/amdgpu/si: load the proper firmware on 0x87 oland boards
  ACPI / video: Add force_native quirk for HP Pavilion dv6
  ACPI / video: Add force_native quirk for Dell XPS 17 L702X
  staging: comedi: ni_mio_common: fix E series ni_ai_insn_read() data
  staging: comedi: ni_mio_common: fix M Series ni_ai_insn_read() data mask
  staging: lustre: ldlm: pl_recalc time handling is wrong
  staging/lustre/osc: Revert erroneous list_for_each_entry_safe use
  hv: acquire vmbus_connection.channel_mutex in vmbus_free_channels()
  docs: sphinx-extensions: make rstFlatTable work with docutils 0.13
  thermal: hwmon: Properly report critical temperature in sysfs
  clk: bcm2835: Avoid overwriting the div info when disabling a pll_div clk
  arm64: tegra: Add VDD_GPU regulator to Jetson TX1
  gpio: chardev: Return error for seek operations
  gpio: stmpe: fix interrupt handling bug
  timekeeping_Force_unsigned_clocksource_to_nanoseconds_conversion
  mmc: sd: Meet alignment requirements for raw_ssr DMA
  regulator: stw481x-vmmc: fix ages old enable error
  mmc: sdhci: Fix recovery from tuning timeout
  Revert "mmc: sdhci: Reset cmd and data circuits after tuning failure"
  ath9k: do not return early to fix rcu unlocking
  ath9k: Really fix LED polarity for some Mini PCI AR9220 MB92 cards.
  ath9k: fix ath9k_hw_gpio_get() to return 0 or 1 on success
  cfg80211/mac80211: fix BSS leaks when abandoning assoc attempts
  rtl8xxxu: Work around issue with 8192eu and 8723bu devices not reconnecting
  perf/x86/intel/cstate: Prevent hotplug callback leak
  perf annotate: Don't throw error for zero length symbols
  perf/x86: Fix exclusion of BTS and LBR for Goldmont
  rtlwifi: Fix enter/exit power_save
  ath10k: fix soft lockup during firmware crash/hw-restart
  ssb: Fix error routine when fallback SPROM fails
  Linux 4.9.1
  x86/kbuild: enable modversions for symbols exported from asm
  builddeb: fix cross-building to arm64 producing host-arch debs
  xfs: set AGI buffer type in xlog_recover_clear_agi_bucket
  xfs: fix up xfs_swap_extent_forks inline extent handling
  arm/xen: Use alloc_percpu rather than __alloc_percpu
  xen/gntdev: Use VM_MIXEDMAP instead of VM_IO to avoid NUMA balancing
  tpm xen: Remove bogus tpm_chip_unregister
  kernel/debug/debug_core.c: more properly delay for secondary CPUs
  watchdog: qcom: fix kernel panic due to external abort on non-linefetch
  watchdog: mei_wdt: request stop on reboot to prevent false positive event
  kernel/watchdog: use nmi registers snapshot in hardlockup handler
  CIFS: Fix a possible memory corruption in push locks
  CIFS: Decrease verbosity of ioctl call
  CIFS: Fix a possible double locking of mutex during reconnect
  CIFS: Fix missing nls unload in smb2_reconnect()
  CIFS: Fix a possible memory corruption during reconnect
  cifs: Fix smbencrypt() to stop pointing a scatterlist at the stack
  ASoC: intel: Fix crash at suspend/resume without card registration
  dm space map metadata: fix 'struct sm_metadata' leak on failed create
  dm raid: fix discard support regression
  dm rq: fix a race condition in rq_completed()
  dm crypt: mark key as invalid until properly loaded
  dm flakey: return -EINVAL on interval bounds error in flakey_ctr()
  dm table: an 'all_blk_mq' table must be loaded for a blk-mq DM device
  dm table: fix 'all_blk_mq' inconsistency when an empty table is loaded
  blk-mq: Do not invoke .queue_rq() for a stopped queue
  PM / OPP: Don't use OPP structure outside of rcu protected section
  PM / OPP: Pass opp_table to dev_pm_opp_put_regulator()
  usb: gadget: composite: always set ep->mult to a sensible value
  mm, page_alloc: keep pcp count and list contents in sync if struct page is corrupted
  mm/vmscan.c: set correct defer count for shrinker
  nvmet: Fix possible infinite loop triggered on hot namespace removal
  loop: return proper error from loop_queue_rq()
  f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack
  f2fs: fix overflow due to condition check order
  f2fs: set ->owner for debugfs status file's file_operations
  Revert "f2fs: use percpu_counter for # of dirty pages in inode"
  ext4: do not perform data journaling when data is encrypted
  ext4: return -ENOMEM instead of success
  ext4: reject inodes with negative size
  ext4: add sanity checking to count_overhead()
  ext4: fix in-superblock mount options processing
  ext4: use more strict checks for inodes_per_block on mount
  ext4: fix stack memory corruption with 64k block size
  ext4: fix mballoc breakage with 64k block size
  ext4: don't lock buffer in ext4_commit_super if holding spinlock
  crypto: caam - fix AEAD givenc descriptors
  ptrace: Don't allow accessing an undumpable mm
  ptrace: Capture the ptracer's creds not PT_PTRACE_CAP
  vfs,mm: fix return value of read() at s_maxbytes
  mm: Add a user_ns owner to mm_struct and fix ptrace permission checks
  block_dev: don't test bdev->bd_contains when it is not stable
  splice: reinstate SIGPIPE/EPIPE handling
  fs: exec: apply CLOEXEC before changing dumpable task flags
  exec: Ensure mm->user_ns contains the execed files
  clk: ti: omap36xx: Work around sprz319 advisory 2.1
  ALSA: hda: when comparing pin configurations, ignore assoc in addition to seq
  ALSA: hda - Gate the mic jack on HP Z1 Gen3 AiO
  ALSA: hda - fix headset-mic problem on a Dell laptop
  ALSA: hda - ignore the assoc and seq when comparing pin configurations
  ALSA: hda/ca0132 - Add quirk for Alienware 15 R2 2016
  ALSA: hiface: Fix M2Tech hiFace driver sampling rate change
  ALSA: usb-audio: Add QuickCam Communicate Deluxe/S7500 to volume_control_quirks
  usbip: vudc: fix: Clear already_seen flag also for ep0
  USB: UHCI: report non-PME wakeup signalling for Intel hardware
  usb: gadget: composite: correctly initialize ep->maxpacket
  usb: gadget: f_uac2: fix error handling at afunc_bind
  usb: core: usbport: Use proper LED API to fix potential crash
  usb: hub: Fix auto-remount of safely removed or ejected USB-3 devices
  usb: dwc3: gadget: set PCM1 field of isochronous-first TRBs
  USB: cdc-acm: add device id for GW Instek AFG-125
  USB: serial: kl5kusb105: fix open error path
  USB: serial: option: add dlink dwm-158
  USB: serial: option: add support for Telit LE922A PIDs 0x1040, 0x1041
  Btrfs: fix qgroup rescan worker initialization
  Btrfs: fix emptiness check for dirtied extent buffers at check_leaf()
  btrfs: store and load values of stripes_min/stripes_max in balance status item
  Btrfs: fix relocation incorrectly dropping data references
  Btrfs: fix tree search logic when replaying directory entry deletes
  Btrfs: fix deadlock caused by fsync when logging directory entries
  Btrfs: fix BUG_ON in btrfs_mark_buffer_dirty
  btrfs: limit async_work allocation and worker func duration
  hotplug: Make register and unregister notifier API symmetric
  ANDROID: Shrink ashmem directly through shmem_fallocate
  ANDROID: ashmem: Add shmem_set_file to mm/shmem.c
  ANDROID: Add android config documentation to boot framework.

Conflicts:
	Documentation/android.txt
	Documentation/cpu-freq/governors.txt
	Documentation/networking/ip-sysctl.txt
	arch/arm64/boot/Makefile
	arch/arm64/kernel/process.c
	arch/arm64/kernel/topology.c
	arch/arm64/mm/dma-mapping.c
	drivers/cpufreq/cpufreq_interactive.c
	drivers/input/misc/gpio_matrix.c
	drivers/md/Kconfig
	drivers/md/dm-android-verity.c
	drivers/md/dm-android-verity.h
	drivers/md/dm-linear.c
	drivers/md/dm-verity-fec.h
	drivers/mmc/core/host.c
	drivers/platform/goldfish/goldfish_pipe.c
	drivers/power/supply/power_supply_sysfs.c
	drivers/scsi/ufs/ufshcd.c
	drivers/scsi/ufs/ufshcd.h
	drivers/staging/android/fiq_debugger/fiq_debugger.c
	drivers/staging/android/lowmemorykiller.c
	drivers/usb/dwc3/core.h
	drivers/usb/gadget/Kconfig
	drivers/usb/gadget/configfs.c
	drivers/usb/gadget/function/Makefile
	drivers/usb/gadget/function/f_accessory.c
	drivers/usb/gadget/function/f_audio_source.c
	drivers/usb/gadget/function/f_fs.c
	drivers/usb/gadget/function/f_mtp.c
	drivers/usb/phy/Kconfig
	drivers/usb/phy/otg-wakelock.c
	fs/ext4/inode.c
	fs/ext4/ioctl.c
	fs/f2fs/data.c
	fs/proc/task_mmu.c
	fs/sdcardfs/derived_perm.c
	fs/sdcardfs/file.c
	fs/sdcardfs/inode.c
	fs/sdcardfs/lookup.c
	fs/sdcardfs/main.c
	fs/sdcardfs/packagelist.c
	fs/sdcardfs/sdcardfs.h
	fs/sdcardfs/super.c
	include/linux/mmc/core.h
	include/linux/power_supply.h
	include/linux/sched.h
	include/linux/usb/f_mtp.h
	include/net/fib_rules.h
	include/net/route.h
	include/trace/events/cpufreq_interactive.h
	include/trace/events/power.h
	include/trace/events/sched.h
	include/uapi/linux/magic.h
	include/uapi/linux/prctl.h
	init/Kconfig
	kernel/cgroup.c
	kernel/configs/android-base.config
	kernel/power/process.c
	kernel/sched/Makefile
	kernel/sched/core.c
	kernel/sched/cputime.c
	kernel/sched/fair.c
	kernel/sched/rt.c
	kernel/sched/sched.h
	kernel/sched/stop_task.c
	kernel/sched/tune.c
	lib/Kconfig.debug
	net/core/fib_rules.c
	net/ipv4/inet_connection_sock.c
	net/ipv4/ping.c
	net/ipv4/raw.c
	net/ipv4/route.c
	net/ipv4/syncookies.c
	net/ipv4/udp.c
	net/ipv6/af_inet6.c
	net/ipv6/ah6.c
	net/ipv6/datagram.c
	net/ipv6/esp6.c
	net/ipv6/icmp.c
	net/ipv6/inet6_connection_sock.c
	net/ipv6/ip6_vti.c
	net/ipv6/ipcomp6.c
	net/ipv6/ping.c
	net/ipv6/raw.c
	net/ipv6/route.c
	net/ipv6/syncookies.c
	net/ipv6/tcp_ipv6.c
	net/ipv6/udp.c

Change-Id: I82455dc7b564fc5d3ad2b784a0eb8f96c6b05d4c
Signed-off-by: Channagoud Kadabi <ckadabi@codeaurora.org>
2017-02-17 16:14:38 -08:00

8376 lines
210 KiB
C

/*
* NET3 Protocol independent device support routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Derived from the non IP parts of dev.c 1.0.19
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
*
* Additional Authors:
* Florian la Roche <rzsfl@rz.uni-sb.de>
* Alan Cox <gw4pts@gw4pts.ampr.org>
* David Hinds <dahinds@users.sourceforge.net>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
* Adam Sulmicki <adam@cfar.umd.edu>
* Pekka Riikonen <priikone@poesidon.pspt.fi>
*
* Changes:
* D.J. Barrow : Fixed bug where dev->refcnt gets set
* to 2 if register_netdev gets called
* before net_dev_init & also removed a
* few lines of code in the process.
* Alan Cox : device private ioctl copies fields back.
* Alan Cox : Transmit queue code does relevant
* stunts to keep the queue safe.
* Alan Cox : Fixed double lock.
* Alan Cox : Fixed promisc NULL pointer trap
* ???????? : Support the full private ioctl range
* Alan Cox : Moved ioctl permission check into
* drivers
* Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
* Alan Cox : 100 backlog just doesn't cut it when
* you start doing multicast video 8)
* Alan Cox : Rewrote net_bh and list manager.
* Alan Cox : Fix ETH_P_ALL echoback lengths.
* Alan Cox : Took out transmit every packet pass
* Saved a few bytes in the ioctl handler
* Alan Cox : Network driver sets packet type before
* calling netif_rx. Saves a function
* call a packet.
* Alan Cox : Hashed net_bh()
* Richard Kooijman: Timestamp fixes.
* Alan Cox : Wrong field in SIOCGIFDSTADDR
* Alan Cox : Device lock protection.
* Alan Cox : Fixed nasty side effect of device close
* changes.
* Rudi Cilibrasi : Pass the right thing to
* set_mac_address()
* Dave Miller : 32bit quantity for the device lock to
* make it work out on a Sparc.
* Bjorn Ekwall : Added KERNELD hack.
* Alan Cox : Cleaned up the backlog initialise.
* Craig Metz : SIOCGIFCONF fix if space for under
* 1 device.
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there
* is no device open function.
* Andi Kleen : Fix error reporting for SIOCGIFCONF
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
* Cyrus Durgin : Cleaned for KMOD
* Adam Sulmicki : Bug Fix : Network Device Unload
* A network device unload needs to purge
* the backlog queue.
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait
* indefinitely on dev->refcnt
* J Hadi Salim : - Backlog queue sampling
* - netif_rx() feedback
*/
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>
#include <linux/crash_dump.h>
#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
struct net_device *dev,
struct netdev_notifier_info *info);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
* semaphore.
*
* Pure readers hold dev_base_lock for reading, or rcu_read_lock()
*
* Writers must hold the rtnl semaphore while they loop through the
* dev_base_head list, and hold dev_base_lock for writing when they do the
* actual updates. This allows pure readers to access the list even
* while a writer is preparing to update it.
*
* To put it another way, dev_base_lock is held for writing only to
* protect against pure readers; the rtnl semaphore provides the
* protection against other writers.
*
* See, for example usages, register_netdevice() and
* unregister_netdevice(), which must be called with the rtnl
* semaphore held.
*/
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
while (++net->dev_base_seq == 0);
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
spin_lock(&sd->input_pkt_queue.lock);
#endif
}
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
struct net *net = dev_net(dev);
ASSERT_RTNL();
write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(net);
}
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
static void unlist_netdevice(struct net_device *dev)
{
ASSERT_RTNL();
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
hlist_del_rcu(&dev->name_hlist);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
/*
* Our notifier list
*/
static RAW_NOTIFIER_HEAD(netdev_chain);
/*
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
* according to dev->type
*/
static const unsigned short netdev_lock_type[] =
{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
static const char *const netdev_lock_name[] =
{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
int i;
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
if (netdev_lock_type[i] == dev_type)
return i;
/* the last key is used by default */
return ARRAY_SIZE(netdev_lock_type) - 1;
}
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
unsigned short dev_type)
{
int i;
i = netdev_lock_pos(dev_type);
lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
netdev_lock_name[i]);
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
int i;
i = netdev_lock_pos(dev->type);
lockdep_set_class_and_name(&dev->addr_list_lock,
&netdev_addr_lock_key[i],
netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************
Protocol management and registration routines
*******************************************************************************/
/*
* Add a protocol ID to the list. Now that the input handler is
* smarter we can dispense with all the messy stuff that used to be
* here.
*
* BEWARE!!! Protocol handlers, mangling input packets,
* MUST BE last in hash buckets and checking protocol handlers
* MUST start from promiscuous ptype_all chain in net_bh.
* It is true now, do not change it.
* Explanation follows: if protocol handler, mangling packet, will
* be the first on list, it is not able to sense, that packet
* is cloned and should be copied-on-write, so that it will
* change it and subsequent readers will get broken packet.
* --ANK (980803)
*/
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
return pt->dev ? &pt->dev->ptype_all : &ptype_all;
else
return pt->dev ? &pt->dev->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
* dev_add_pack - add packet handler
* @pt: packet type declaration
*
* Add a protocol handler to the networking stack. The passed &packet_type
* is linked into kernel lists and may not be freed until it has been
* removed from the kernel lists.
*
* This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new packet type (until the next received packet).
*/
void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
/**
* __dev_remove_pack - remove packet handler
* @pt: packet type declaration
*
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
* returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
* through a quiescent state.
*/
void __dev_remove_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
if (pt == pt1) {
list_del_rcu(&pt->list);
goto out;
}
}
pr_warn("dev_remove_pack: %p not found\n", pt);
out:
spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
* dev_remove_pack - remove packet handler
* @pt: packet type declaration
*
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
* returns.
*
* This call sleeps to guarantee that no CPU is looking at the packet
* type after return.
*/
void dev_remove_pack(struct packet_type *pt)
{
__dev_remove_pack(pt);
synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
* dev_add_offload - register offload handlers
* @po: protocol offload declaration
*
* Add protocol offload handlers to the networking stack. The passed
* &proto_offload is linked into kernel lists and may not be freed until
* it has been removed from the kernel lists.
*
* This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new offload handlers (until the next received packet).
*/
void dev_add_offload(struct packet_offload *po)
{
struct packet_offload *elem;
spin_lock(&offload_lock);
list_for_each_entry(elem, &offload_base, list) {
if (po->priority < elem->priority)
break;
}
list_add_rcu(&po->list, elem->list.prev);
spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
/**
* __dev_remove_offload - remove offload handler
* @po: packet offload declaration
*
* Remove a protocol offload handler that was previously added to the
* kernel offload handlers by dev_add_offload(). The passed &offload_type
* is removed from the kernel lists and can be freed or reused once this
* function returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
* through a quiescent state.
*/
static void __dev_remove_offload(struct packet_offload *po)
{
struct list_head *head = &offload_base;
struct packet_offload *po1;
spin_lock(&offload_lock);
list_for_each_entry(po1, head, list) {
if (po == po1) {
list_del_rcu(&po->list);
goto out;
}
}
pr_warn("dev_remove_offload: %p not found\n", po);
out:
spin_unlock(&offload_lock);
}
/**
* dev_remove_offload - remove packet offload handler
* @po: packet offload declaration
*
* Remove a packet offload handler that was previously added to the kernel
* offload handlers by dev_add_offload(). The passed &offload_type is
* removed from the kernel lists and can be freed or reused once this
* function returns.
*
* This call sleeps to guarantee that no CPU is looking at the packet
* type after return.
*/
void dev_remove_offload(struct packet_offload *po)
{
__dev_remove_offload(po);
synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
Device Boot-time Settings Routines
*******************************************************************************/
/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
/**
* netdev_boot_setup_add - add new setup entry
* @name: name of the device
* @map: configured settings for the device
*
* Adds new setup entry to the dev_boot_setup list. The function
* returns 0 on error and 1 on success. This is a generic routine to
* all netdevices.
*/
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
struct netdev_boot_setup *s;
int i;
s = dev_boot_setup;
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
memset(s[i].name, 0, sizeof(s[i].name));
strlcpy(s[i].name, name, IFNAMSIZ);
memcpy(&s[i].map, map, sizeof(s[i].map));
break;
}
}
return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
* netdev_boot_setup_check - check boot time settings
* @dev: the netdevice
*
* Check boot time settings for the device.
* The found settings are set for the device to be used
* later in the device probing.
* Returns 0 if no settings found, 1 if they are.
*/
int netdev_boot_setup_check(struct net_device *dev)
{
struct netdev_boot_setup *s = dev_boot_setup;
int i;
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
!strcmp(dev->name, s[i].name)) {
dev->irq = s[i].map.irq;
dev->base_addr = s[i].map.base_addr;
dev->mem_start = s[i].map.mem_start;
dev->mem_end = s[i].map.mem_end;
return 1;
}
}
return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
* netdev_boot_base - get address from boot time settings
* @prefix: prefix for network device
* @unit: id for network device
*
* Check boot time settings for the base address of device.
* The found settings are set for the device to be used
* later in the device probing.
* Returns 0 if no settings found.
*/
unsigned long netdev_boot_base(const char *prefix, int unit)
{
const struct netdev_boot_setup *s = dev_boot_setup;
char name[IFNAMSIZ];
int i;
sprintf(name, "%s%d", prefix, unit);
/*
* If device already registered then return base of 1
* to indicate not to probe for this interface
*/
if (__dev_get_by_name(&init_net, name))
return 1;
for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
if (!strcmp(name, s[i].name))
return s[i].map.base_addr;
return 0;
}
/*
* Saves at boot time configured settings for any netdevice.
*/
int __init netdev_boot_setup(char *str)
{
int ints[5];
struct ifmap map;
str = get_options(str, ARRAY_SIZE(ints), ints);
if (!str || !*str)
return 0;
/* Save settings */
memset(&map, 0, sizeof(map));
if (ints[0] > 0)
map.irq = ints[1];
if (ints[0] > 1)
map.base_addr = ints[2];
if (ints[0] > 2)
map.mem_start = ints[3];
if (ints[0] > 3)
map.mem_end = ints[4];
/* Add new entry to the list */
return netdev_boot_setup_add(str, &map);
}
__setup("netdev=", netdev_boot_setup);
/*******************************************************************************
Device Interface Subroutines
*******************************************************************************/
/**
* dev_get_iflink - get 'iflink' value of a interface
* @dev: targeted interface
*
* Indicates the ifindex the interface is linked to.
* Physical interfaces have the same 'ifindex' and 'iflink' values.
*/
int dev_get_iflink(const struct net_device *dev)
{
if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
return dev->netdev_ops->ndo_get_iflink(dev);
return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
* dev_fill_metadata_dst - Retrieve tunnel egress information.
* @dev: targeted interface
* @skb: The packet.
*
* For better visibility of tunnel traffic OVS needs to retrieve
* egress tunnel information for a packet. Following API allows
* user to get this info.
*/
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info;
if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
return -EINVAL;
info = skb_tunnel_info_unclone(skb);
if (!info)
return -ENOMEM;
if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
return -EINVAL;
return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. Must be called under RTNL semaphore
* or @dev_base_lock. If the name is found a pointer to the device
* is returned. If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
hlist_for_each_entry(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
* dev_get_by_name_rcu - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name.
* If the name is found a pointer to the device is returned.
* If the name is not found then %NULL is returned.
* The reference counters are not incremented so the caller must be
* careful with locks. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
hlist_for_each_entry_rcu(dev, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
* dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. This can be called from any
* context and does its own locking. The returned handle has
* the usage count incremented and the caller must use dev_put() to
* release it when it is no longer needed. %NULL is returned if no
* matching device is found.
*/
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
if (dev)
dev_hold(dev);
rcu_read_unlock();
return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
* about locking. The caller must hold either the RTNL semaphore
* or @dev_base_lock.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex);
hlist_for_each_entry(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
* dev_get_by_index_rcu - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex);
hlist_for_each_entry_rcu(dev, head, index_hlist)
if (dev->ifindex == ifindex)
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
* dev_get_by_index - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns NULL if the device
* is not found or a pointer to the device. The device returned has
* had a reference added and the pointer is safe until the user calls
* dev_put to indicate they have finished with it.
*/
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
dev_hold(dev);
rcu_read_unlock();
return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
*
* The use of raw_seqcount_begin() and cond_resched() before
* retrying is required as we want to give the writers a chance
* to complete when CONFIG_PREEMPT is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
unsigned int seq;
retry:
seq = raw_seqcount_begin(&devnet_rename_seq);
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
rcu_read_unlock();
return -ENODEV;
}
strcpy(name, dev->name);
rcu_read_unlock();
if (read_seqcount_retry(&devnet_rename_seq, seq)) {
cond_resched();
goto retry;
}
return 0;
}
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
* The caller must hold RCU or RTNL.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
*/
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
const char *ha)
{
struct net_device *dev;
for_each_netdev_rcu(net, dev)
if (dev->type == type &&
!memcmp(dev->dev_addr, ha, dev->addr_len))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev;
ASSERT_RTNL();
for_each_netdev(net, dev)
if (dev->type == type)
return dev;
return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
rcu_read_lock();
for_each_netdev_rcu(net, dev)
if (dev->type == type) {
dev_hold(dev);
ret = dev;
break;
}
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
* __dev_get_by_flags - find any device with given flags
* @net: the applicable net namespace
* @if_flags: IFF_* values
* @mask: bitmask of bits in if_flags to check
*
* Search for any interface with the given flags. Returns NULL if a device
* is not found or a pointer to the device. Must be called inside
* rtnl_lock(), and result refcount is unchanged.
*/
struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
unsigned short mask)
{
struct net_device *dev, *ret;
ASSERT_RTNL();
ret = NULL;
for_each_netdev(net, dev) {
if (((dev->flags ^ if_flags) & mask) == 0) {
ret = dev;
break;
}
}
return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
* dev_valid_name - check if name is okay for network device
* @name: name string
*
* Network device names need to be valid file names to
* to allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
{
if (*name == '\0')
return false;
if (strlen(name) >= IFNAMSIZ)
return false;
if (!strcmp(name, ".") || !strcmp(name, ".."))
return false;
while (*name) {
if (*name == '/' || *name == ':' || isspace(*name))
return false;
name++;
}
return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
* @buf: scratch buffer and result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
* the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
* Returns the number of the unit assigned or a negative errno code.
*/
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
p = strnchr(name, IFNAMSIZ-1, '%');
if (p) {
/*
* Verify the string as this thing may have come from
* the user. There must be either one "%d" and no other "%"
* characters.
*/
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
/* Use one page as a bit array of possible slots */
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
if (!inuse)
return -ENOMEM;
for_each_netdev(net, d) {
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
free_page((unsigned long) inuse);
}
if (buf != name)
snprintf(buf, IFNAMSIZ, name, i);
if (!__dev_get_by_name(net, buf))
return i;
/* It is possible to run out of possible slots
* when the name is long and there isn't enough space left
* for the digits, or if all bits are used.
*/
return -ENFILE;
}
/**
* dev_alloc_name - allocate a name for a device
* @dev: device
* @name: name format string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
* the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
* Returns the number of the unit assigned or a negative errno code.
*/
int dev_alloc_name(struct net_device *dev, const char *name)
{
char buf[IFNAMSIZ];
struct net *net;
int ret;
BUG_ON(!dev_net(dev));
net = dev_net(dev);
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
strlcpy(dev->name, buf, IFNAMSIZ);
return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
static int dev_alloc_name_ns(struct net *net,
struct net_device *dev,
const char *name)
{
char buf[IFNAMSIZ];
int ret;
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
strlcpy(dev->name, buf, IFNAMSIZ);
return ret;
}
static int dev_get_valid_name(struct net *net,
struct net_device *dev,
const char *name)
{
BUG_ON(!net);
if (!dev_valid_name(name))
return -EINVAL;
if (strchr(name, '%'))
return dev_alloc_name_ns(net, dev, name);
else if (__dev_get_by_name(net, name))
return -EEXIST;
else if (dev->name != name)
strlcpy(dev->name, name, IFNAMSIZ);
return 0;
}
/**
* dev_change_name - change name of a device
* @dev: device
* @newname: name (or format string) must be at least IFNAMSIZ
*
* Change name of a device, can pass format strings "eth%d".
* for wildcarding.
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
struct net *net;
ASSERT_RTNL();
BUG_ON(!dev_net(dev));
net = dev_net(dev);
if (dev->flags & IFF_UP)
return -EBUSY;
write_seqcount_begin(&devnet_rename_seq);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
write_seqcount_end(&devnet_rename_seq);
return 0;
}
memcpy(oldname, dev->name, IFNAMSIZ);
err = dev_get_valid_name(net, dev, newname);
if (err < 0) {
write_seqcount_end(&devnet_rename_seq);
return err;
}
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s\n", oldname);
old_assign_type = dev->name_assign_type;
dev->name_assign_type = NET_NAME_RENAMED;
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
write_seqcount_end(&devnet_rename_seq);
return ret;
}
write_seqcount_end(&devnet_rename_seq);
netdev_adjacent_rename_links(dev, oldname);
write_lock_bh(&dev_base_lock);
hlist_del_rcu(&dev->name_hlist);
write_unlock_bh(&dev_base_lock);
synchronize_rcu();
write_lock_bh(&dev_base_lock);
hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
write_unlock_bh(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
if (ret) {
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
pr_err("%s: name change rollback failed: %d\n",
dev->name, ret);
}
}
return err;
}
/**
* dev_set_alias - change ifalias of a device
* @dev: device
* @alias: name up to IFALIASZ
* @len: limit of bytes to copy from info
*
* Set ifalias for a device,
*/
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
char *new_ifalias;
ASSERT_RTNL();
if (len >= IFALIASZ)
return -EINVAL;
if (!len) {
kfree(dev->ifalias);
dev->ifalias = NULL;
return 0;
}
new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
if (!new_ifalias)
return -ENOMEM;
dev->ifalias = new_ifalias;
strlcpy(dev->ifalias, alias, len+1);
return len;
}
/**
* netdev_features_change - device changes features
* @dev: device to cause notification
*
* Called to indicate a device has changed features.
*/
void netdev_features_change(struct net_device *dev)
{
call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
/**
* netdev_state_change - device changes state
* @dev: device to cause notification
*
* Called to indicate a device has changed state. This function calls
* the notifier chains for netdev_chain and sends a NEWLINK message
* to the routing socket.
*/
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info;
change_info.flags_changed = 0;
call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
&change_info.info);
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
EXPORT_SYMBOL(netdev_state_change);
/**
* netdev_notify_peers - notify network peers about existence of @dev
* @dev: network device
*
* Generate traffic such that interested network peers are aware of
* @dev, such as by generating a gratuitous ARP. This may be used when
* a device wants to inform the rest of the network about some sort of
* reconfiguration such as a failover event or virtual machine
* migration.
*/
void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
if (!netif_device_present(dev))
return -ENODEV;
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
netpoll_poll_disable(dev);
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
if (ret)
return ret;
set_bit(__LINK_STATE_START, &dev->state);
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
dev->flags |= IFF_UP;
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
}
return ret;
}
/**
* dev_open - prepare an interface for use.
* @dev: device to open
*
* Takes a device from down to up state. The device's private open
* function is invoked and then the multicast lists are loaded. Finally
* the device is moved into the up state and a %NETDEV_UP message is
* sent to the netdev notifier chain.
*
* Calling this function on an active interface is a nop. On a failure
* a negative errno code is returned.
*/
int dev_open(struct net_device *dev)
{
int ret;
if (dev->flags & IFF_UP)
return 0;
ret = __dev_open(dev);
if (ret < 0)
return ret;
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
}
EXPORT_SYMBOL(dev_open);
static int __dev_close_many(struct list_head *head)
{
struct net_device *dev;
ASSERT_RTNL();
might_sleep();
list_for_each_entry(dev, head, close_list) {
/* Temporarily disable netpoll until the interface is down */
netpoll_poll_disable(dev);
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list, it
* can be even on different cpu. So just clear netif_running().
*
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
smp_mb__after_atomic(); /* Commit netif_running(). */
}
dev_deactivate_many(head);
list_for_each_entry(dev, head, close_list) {
const struct net_device_ops *ops = dev->netdev_ops;
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
*
* We allow it to be called even after a DETACH hot-plug
* event.
*/
if (ops->ndo_stop)
ops->ndo_stop(dev);
dev->flags &= ~IFF_UP;
netpoll_poll_enable(dev);
}
return 0;
}
static int __dev_close(struct net_device *dev)
{
int retval;
LIST_HEAD(single);
list_add(&dev->close_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
return retval;
}
int dev_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
/* Remove the devices that don't need to be closed */
list_for_each_entry_safe(dev, tmp, head, close_list)
if (!(dev->flags & IFF_UP))
list_del_init(&dev->close_list);
__dev_close_many(head);
list_for_each_entry_safe(dev, tmp, head, close_list) {
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
if (unlink)
list_del_init(&dev->close_list);
}
return 0;
}
EXPORT_SYMBOL(dev_close_many);
/**
* dev_close - shutdown an interface.
* @dev: device to shutdown
*
* This function moves an active device into down state. A
* %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
* is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
* chain.
*/
int dev_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
list_add(&dev->close_list, &single);
dev_close_many(&single, true);
list_del(&single);
}
return 0;
}
EXPORT_SYMBOL(dev_close);
/**
* dev_disable_lro - disable Large Receive Offload on a device
* @dev: device
*
* Disable Large Receive Offload (LRO) on a net device. Must be
* called under RTNL. This is needed if received packets may be
* forwarded to another interface.
*/
void dev_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
netdev_for_each_lower_dev(dev, lower_dev, iter)
dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
struct net_device *dev)
{
struct netdev_notifier_info info;
netdev_notifier_info_init(&info, dev);
return nb->notifier_call(nb, val, &info);
}
static int dev_boot_phase = 1;
/**
* register_netdevice_notifier - register a network notifier block
* @nb: notifier
*
* Register a notifier to be called when network device events occur.
* The notifier passed is linked into the kernel structures and must
* not be reused until it has been unregistered. A negative errno code
* is returned on a failure.
*
* When registered all registration and up events are replayed
* to the new notifier to allow device to have a race free
* view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
{
struct net_device *dev;
struct net_device *last;
struct net *net;
int err;
rtnl_lock();
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
for_each_netdev(net, dev) {
err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
err = notifier_to_errno(err);
if (err)
goto rollback;
if (!(dev->flags & IFF_UP))
continue;
call_netdevice_notifier(nb, NETDEV_UP, dev);
}
}
unlock:
rtnl_unlock();
return err;
rollback:
last = dev;
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev == last)
goto outroll;
if (dev->flags & IFF_UP) {
call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
dev);
call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
}
outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
/**
* unregister_netdevice_notifier - unregister a network notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
* register_netdevice_notifier(). The notifier is unlinked into the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
*
* After unregistering unregister and down device events are synthesized
* for all devices on the device list to the removed notifier to remove
* the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{
struct net_device *dev;
struct net *net;
int err;
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb);
if (err)
goto unlock;
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev->flags & IFF_UP) {
call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
dev);
call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
}
unlock:
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
* call_netdevice_notifiers_info - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
* @info: notifier information data
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
static int call_netdevice_notifiers_info(unsigned long val,
struct net_device *dev,
struct netdev_notifier_info *info)
{
ASSERT_RTNL();
netdev_notifier_info_init(info, dev);
return raw_notifier_call_chain(&netdev_chain, val, info);
}
/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
struct netdev_notifier_info info;
return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;
void net_inc_ingress_queue(void)
{
static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
void net_dec_ingress_queue(void)
{
static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;
void net_inc_egress_queue(void)
{
static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
void net_dec_egress_queue(void)
{
static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
* If net_disable_timestamp() is called from irq context, defer the
* static_key_slow_dec() calls.
*/
static atomic_t netstamp_needed_deferred;
#endif
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
if (deferred) {
while (--deferred)
static_key_slow_dec(&netstamp_needed);
return;
}
#endif
static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
if (in_interrupt()) {
atomic_inc(&netstamp_needed_deferred);
return;
}
#endif
static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp.tv64 = 0;
if (static_key_false(&netstamp_needed))
__net_timestamp(skb);
}
#define net_timestamp_check(COND, SKB) \
if (static_key_false(&netstamp_needed)) { \
if ((COND) && !(SKB)->tstamp.tv64) \
__net_timestamp(SKB); \
} \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
unsigned int len;
if (!(dev->flags & IFF_UP))
return false;
len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
if (skb->len <= len)
return true;
/* if TSO is enabled, we don't care about the length as the packet
* could be forwarded without being segmented before
*/
if (skb_is_gso(skb))
return true;
return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret = ____dev_forward_skb(dev, skb);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
*
* @dev: destination network device
* @skb: buffer to forward
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped, but freed)
*
* dev_forward_skb can be used for injecting an skb from the
* start_xmit function of one device into the receive queue
* of another device.
*
* The receiving device may be in another namespace, so
* we have to clear all information in the skb that could
* impact namespace isolation.
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
return -ENOMEM;
atomic_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
struct net_device *orig_dev,
__be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
if (pt_prev)
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
if (!ptype->af_packet_priv || !skb->sk)
return false;
if (ptype->id_match)
return ptype->id_match(ptype, skb->sk);
else if ((struct sock *)ptype->af_packet_priv == skb->sk)
return true;
return false;
}
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
*/
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
struct packet_type *pt_prev = NULL;
struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
if (skb_loop_sk(ptype, skb))
continue;
if (pt_prev) {
deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
}
/* need to clone skb, done only once */
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
goto out_unlock;
net_timestamp_set(skb2);
/* skb->nh should be correctly
* set by sender, so that the second statement is
* just protection against buggy protocols.
*/
skb_reset_mac_header(skb2);
if (skb_network_header(skb2) < skb2->data ||
skb_network_header(skb2) > skb_tail_pointer(skb2)) {
net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
ntohs(skb2->protocol),
dev->name);
skb_reset_network_header(skb2);
}
skb2->transport_header = skb2->network_header;
skb2->pkt_type = PACKET_OUTGOING;
pt_prev = ptype;
}
if (ptype_list == &ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
out_unlock:
if (pt_prev)
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
* @dev: Network device
* @txq: number of queues available
*
* If real_num_tx_queues is changed the tc mappings may no longer be
* valid. To resolve this verify the tc mapping remains valid and if
* not NULL the mapping. With no priorities mapping to this
* offset/count pair it will no longer be used. In the worst case TC0
* is invalid nothing can be done so disable priority mappings. If is
* expected that drivers will fix this mapping if they can before
* calling netif_set_real_num_tx_queues.
*/
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
int i;
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
/* If TC0 is invalidated disable TC mapping */
if (tc->offset + tc->count > txq) {
pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0;
return;
}
/* Invalidated prio to tc mappings set to TC0 */
for (i = 1; i < TC_BITMASK + 1; i++) {
int q = netdev_get_prio_tc_map(dev, i);
tc = &dev->tc_to_txq[q];
if (tc->offset + tc->count > txq) {
pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
}
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
int cpu, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
map = xmap_dereference(dev_maps->cpu_map[cpu]);
for (pos = 0; map && pos < map->len; pos++) {
if (map->queues[pos] == index) {
if (map->len > 1) {
map->queues[pos] = map->queues[--map->len];
} else {
RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
kfree_rcu(map, rcu);
map = NULL;
}
break;
}
}
return map;
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
struct xps_dev_maps *dev_maps;
int cpu, i;
bool active = false;
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
if (!dev_maps)
goto out_no_maps;
for_each_possible_cpu(cpu) {
for (i = index; i < dev->num_tx_queues; i++) {
if (!remove_xps_queue(dev_maps, cpu, i))
break;
}
if (i == dev->num_tx_queues)
active = true;
}
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
for (i = index; i < dev->num_tx_queues; i++)
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
NUMA_NO_NODE);
out_no_maps:
mutex_unlock(&xps_map_mutex);
}
static struct xps_map *expand_xps_map(struct xps_map *map,
int cpu, u16 index)
{
struct xps_map *new_map;
int alloc_len = XPS_MIN_MAP_ALLOC;
int i, pos;
for (pos = 0; map && pos < map->len; pos++) {
if (map->queues[pos] != index)
continue;
return map;
}
/* Need to add queue to this CPU's existing map */
if (map) {
if (pos < map->alloc_len)
return map;
alloc_len = map->alloc_len * 2;
}
/* Need to allocate new map to store queue on this CPU's map */
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
cpu_to_node(cpu));
if (!new_map)
return NULL;
for (i = 0; i < pos; i++)
new_map->queues[i] = map->queues[i];
new_map->alloc_len = alloc_len;
new_map->len = pos;
return new_map;
}
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
struct xps_map *map, *new_map;
int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
int cpu, numa_node_id = -2;
bool active = false;
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps);
/* allocate memory for queue storage */
for_each_online_cpu(cpu) {
if (!cpumask_test_cpu(cpu, mask))
continue;
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
mutex_unlock(&xps_map_mutex);
return -ENOMEM;
}
map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
NULL;
map = expand_xps_map(map, cpu, index);
if (!map)
goto error;
RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
for_each_possible_cpu(cpu) {
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
/* add queue to CPU maps */
int pos = 0;
map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
if (numa_node_id == -2)
numa_node_id = cpu_to_node(cpu);
else if (numa_node_id != cpu_to_node(cpu))
numa_node_id = -1;
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
map = xmap_dereference(dev_maps->cpu_map[cpu]);
RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
}
}
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
/* Cleanup old maps */
if (dev_maps) {
for_each_possible_cpu(cpu) {
new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
map = xmap_dereference(dev_maps->cpu_map[cpu]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
kfree_rcu(dev_maps, rcu);
}
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ? numa_node_id :
NUMA_NO_NODE);
if (!dev_maps)
goto out_no_maps;
/* removes queue from unused CPUs */
for_each_possible_cpu(cpu) {
if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
continue;
if (remove_xps_queue(dev_maps, cpu, index))
active = true;
}
/* free map if not active */
if (!active) {
RCU_INIT_POINTER(dev->xps_maps, NULL);
kfree_rcu(dev_maps, rcu);
}
out_no_maps:
mutex_unlock(&xps_map_mutex);
return 0;
error:
/* remove any maps that we added */
for_each_possible_cpu(cpu) {
new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
NULL;
if (new_map && new_map != map)
kfree(new_map);
}
mutex_unlock(&xps_map_mutex);
kfree(new_dev_maps);
return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
int rc;
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
if (rc)
return rc;
if (dev->num_tc)
netif_setup_tc(dev, txq);
if (txq < dev->real_num_tx_queues) {
qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, txq);
#endif
}
}
dev->real_num_tx_queues = txq;
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
* @rxq: Actual number of RX queues
*
* This must be called either with the rtnl_lock held or before
* registration of the net device. Returns 0 on success, or a
* negative error code. If called before registration, it always
* succeeds.
*/
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
int rc;
if (rxq < 1 || rxq > dev->num_rx_queues)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED) {
ASSERT_RTNL();
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
if (rc)
return rc;
}
dev->real_num_rx_queues = rxq;
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
/**
* netif_get_num_default_rss_queues - default number of RSS queues
*
* This routine should set an upper limit on the number of RSS queues
* used by default by multiqueue devices.
*/
int netif_get_num_default_rss_queues(void)
{
return is_kdump_kernel() ?
1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
static void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
unsigned long flags;
local_irq_save(flags);
sd = this_cpu_ptr(&softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
void __netif_schedule(struct Qdisc *q)
{
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
enum skb_free_reason reason;
};
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
return (struct dev_kfree_skb_cb *)skb->cb;
}
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
struct Qdisc *q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
}
rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);
/**
* netif_wake_subqueue - allow sending packets on subqueue
* @dev: network device
* @queue_index: sub queue index
*
* Resume individual transmit queue of a device with multiple transmit queues.
*/
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
struct Qdisc *q;
rcu_read_lock();
q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(netif_wake_subqueue);
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
struct Qdisc *q;
rcu_read_lock();
q = rcu_dereference(dev_queue->qdisc);
__netif_schedule(q);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(netif_tx_wake_queue);
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
if (likely(atomic_read(&skb->users) == 1)) {
smp_rmb();
atomic_set(&skb->users, 0);
} else if (likely(!atomic_dec_and_test(&skb->users))) {
return;
}
get_kfree_skb_cb(skb)->reason = reason;
local_irq_save(flags);
skb->next = __this_cpu_read(softnet_data.completion_queue);
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
if (in_irq() || irqs_disabled())
__dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
/**
* netif_device_detach - mark device as removed
* @dev: network device
*
* Mark device as removed from system and therefore no longer available.
*/
void netif_device_detach(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_stop_all_queues(dev);
}
}
EXPORT_SYMBOL(netif_device_detach);
/**
* netif_device_attach - mark device as attached
* @dev: network device
*
* Mark device as attached from system and restart if needed.
*/
void netif_device_attach(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
__netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
/*
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
while (unlikely(hash >= num_tx_queues))
hash -= num_tx_queues;
return hash;
}
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
qoffset = dev->tc_to_txq[tc].offset;
qcount = dev->tc_to_txq[tc].count;
}
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
const char *name = "";
if (!net_ratelimit())
return;
if (dev) {
if (dev->dev.parent)
name = dev_driver_string(dev->dev.parent);
else
name = netdev_name(dev);
}
WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
"gso_type=%d ip_summed=%d\n",
name, dev ? &dev->features : &null_features,
skb->sk ? &skb->sk->sk_route_caps : &null_features,
skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
skb_shinfo(skb)->gso_type, skb->ip_summed);
}
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
*/
int skb_checksum_help(struct sk_buff *skb)
{
__wsum csum;
int ret = 0, offset;
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
if (unlikely(skb_shinfo(skb)->gso_size)) {
skb_warn_bad_offload(skb);
return -EINVAL;
}
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
if (skb_has_shared_frag(skb)) {
ret = __skb_linearize(skb);
if (ret)
goto out;
}
offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));
csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
if (skb_cloned(skb) &&
!skb_clone_writable(skb, offset + sizeof(__sum16))) {
ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
if (ret)
goto out;
}
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
out:
return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
/* skb_csum_offload_check - Driver helper function to determine if a device
* with limited checksum offload capabilities is able to offload the checksum
* for a given packet.
*
* Arguments:
* skb - sk_buff for the packet in question
* spec - contains the description of what device can offload
* csum_encapped - returns true if the checksum being offloaded is
* encpasulated. That is it is checksum for the transport header
* in the inner headers.
* checksum_help - when set indicates that helper function should
* call skb_checksum_help if offload checks fail
*
* Returns:
* true: Packet has passed the checksum checks and should be offloadable to
* the device (a driver may still need to check for additional
* restrictions of its device)
* false: Checksum is not offloadable. If checksum_help was set then
* skb_checksum_help was called to resolve checksum for non-GSO
* packets and when IP protocol is not SCTP
*/
bool __skb_csum_offload_chk(struct sk_buff *skb,
const struct skb_csum_offl_spec *spec,
bool *csum_encapped,
bool csum_help)
{
struct iphdr *iph;
struct ipv6hdr *ipv6;
void *nhdr;
int protocol;
u8 ip_proto;
if (skb->protocol == htons(ETH_P_8021Q) ||
skb->protocol == htons(ETH_P_8021AD)) {
if (!spec->vlan_okay)
goto need_help;
}
/* We check whether the checksum refers to a transport layer checksum in
* the outermost header or an encapsulated transport layer checksum that
* corresponds to the inner headers of the skb. If the checksum is for
* something else in the packet we need help.
*/
if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
/* Non-encapsulated checksum */
protocol = eproto_to_ipproto(vlan_get_protocol(skb));
nhdr = skb_network_header(skb);
*csum_encapped = false;
if (spec->no_not_encapped)
goto need_help;
} else if (skb->encapsulation && spec->encap_okay &&
skb_checksum_start_offset(skb) ==
skb_inner_transport_offset(skb)) {
/* Encapsulated checksum */
*csum_encapped = true;
switch (skb->inner_protocol_type) {
case ENCAP_TYPE_ETHER:
protocol = eproto_to_ipproto(skb->inner_protocol);
break;
case ENCAP_TYPE_IPPROTO:
protocol = skb->inner_protocol;
break;
}
nhdr = skb_inner_network_header(skb);
} else {
goto need_help;
}
switch (protocol) {
case IPPROTO_IP:
if (!spec->ipv4_okay)
goto need_help;
iph = nhdr;
ip_proto = iph->protocol;
if (iph->ihl != 5 && !spec->ip_options_okay)
goto need_help;
break;
case IPPROTO_IPV6:
if (!spec->ipv6_okay)
goto need_help;
if (spec->no_encapped_ipv6 && *csum_encapped)
goto need_help;
ipv6 = nhdr;
nhdr += sizeof(*ipv6);
ip_proto = ipv6->nexthdr;
break;
default:
goto need_help;
}
ip_proto_again:
switch (ip_proto) {
case IPPROTO_TCP:
if (!spec->tcp_okay ||
skb->csum_offset != offsetof(struct tcphdr, check))
goto need_help;
break;
case IPPROTO_UDP:
if (!spec->udp_okay ||
skb->csum_offset != offsetof(struct udphdr, check))
goto need_help;
break;
case IPPROTO_SCTP:
if (!spec->sctp_okay ||
skb->csum_offset != offsetof(struct sctphdr, checksum))
goto cant_help;
break;
case NEXTHDR_HOP:
case NEXTHDR_ROUTING:
case NEXTHDR_DEST: {
u8 *opthdr = nhdr;
if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
goto need_help;
ip_proto = opthdr[0];
nhdr += (opthdr[1] + 1) << 3;
goto ip_proto_again;
}
default:
goto need_help;
}
/* Passed the tests for offloading checksum */
return true;
need_help:
if (csum_help && !skb_shinfo(skb)->gso_size)
skb_checksum_help(skb);
cant_help:
return false;
}
EXPORT_SYMBOL(__skb_csum_offload_chk);
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
/* Tunnel gso handlers can set protocol to ethernet. */
if (type == htons(ETH_P_TEB)) {
struct ethhdr *eth;
if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
return 0;
eth = (struct ethhdr *)skb_mac_header(skb);
type = eth->h_proto;
}
return __vlan_get_protocol(skb, type, depth);
}
/**
* skb_mac_gso_segment - mac layer segmentation handler.
* @skb: buffer to segment
* @features: features for the output path (see dev->features)
*/
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
int vlan_depth = skb->mac_len;
__be16 type = skb_network_protocol(skb, &vlan_depth);
if (unlikely(!type))
return ERR_PTR(-EINVAL);
__skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
segs = ptype->callbacks.gso_segment(skb, features);
break;
}
}
rcu_read_unlock();
__skb_push(skb, skb->data - skb_mac_header(skb));
return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
/* openvswitch calls this on rx path, so we need a different check.
*/
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
return skb->ip_summed != CHECKSUM_PARTIAL;
else
return skb->ip_summed == CHECKSUM_NONE;
}
/**
* __skb_gso_segment - Perform segmentation on skb.
* @skb: buffer to segment
* @features: features for the output path (see dev->features)
* @tx_path: whether it is called in TX path
*
* This function segments the given skb and returns a list of segments.
*
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
* Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
{
if (unlikely(skb_needs_check(skb, tx_path))) {
int err;
skb_warn_bad_offload(skb);
err = skb_cow_head(skb, 0);
if (err < 0)
return ERR_PTR(err);
}
/* Only report GSO partial support if it will enable us to
* support segmentation on this frame without needing additional
* work.
*/
if (features & NETIF_F_GSO_PARTIAL) {
netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
struct net_device *dev = skb->dev;
partial_features |= dev->features & dev->gso_partial_features;
if (!skb_gso_ok(skb, features | partial_features))
features &= ~NETIF_F_GSO_PARTIAL;
}
BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
SKB_GSO_CB(skb)->encap_level = 0;
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
return skb_mac_gso_segment(skb, features);
}
EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
dump_stack();
}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know, that:
* 1. IOMMU is present and allows to map all the memory.
* 2. No high memory really exists on this machine.
*/
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (PageHighMem(skb_frag_page(frag)))
return 1;
}
}
if (PCI_DMA_BUS_IS_PHYS) {
struct device *pdev = dev->dev.parent;
if (!pdev)
return 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
dma_addr_t addr = page_to_phys(skb_frag_page(frag));
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
}
#endif
return 0;
}
/* If MPLS offload request, verify we are testing hardware MPLS features
* instead of standard features for the netdev.
*/
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
return features;
}
#endif
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
int tmp;
__be16 type;
type = skb_network_protocol(skb, &tmp);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
}
if (illegal_highdma(skb->dev, skb))
features &= ~NETIF_F_SG;
return features;
}
netdev_features_t passthru_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
return features;
}
EXPORT_SYMBOL(passthru_features_check);
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
return vlan_features_check(skb, features);
}
static netdev_features_t gso_features_check(const struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
u16 gso_segs = skb_shinfo(skb)->gso_segs;
if (gso_segs > dev->gso_max_segs)
return features & ~NETIF_F_GSO_MASK;
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
* so we need to strip support for any partial features now
* and we can pull them back in after we have partially
* segmented the frame.
*/
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
/* Make sure to clear the IPv4 ID mangling feature if the
* IPv4 header has the potential to be fragmented.
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
features &= ~NETIF_F_TSO_MANGLEID;
}
return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
if (skb_is_gso(skb))
features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
* features for the netdev
*/
if (skb->encapsulation)
features &= dev->hw_enc_features;
if (skb_vlan_tagged(skb))
features = netdev_intersect_features(features,
dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
if (dev->netdev_ops->ndo_features_check)
features &= dev->netdev_ops->ndo_features_check(skb, dev,
features);
else
features &= dflt_features_check(skb, dev, features);
return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
unsigned int len;
int rc;
if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
return rc;
}
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
struct netdev_queue *txq, int *ret)
{
struct sk_buff *skb = first;
int rc = NETDEV_TX_OK;
while (skb) {
struct sk_buff *next = skb->next;
skb->next = NULL;
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
goto out;
}
skb = next;
if (netif_xmit_stopped(txq) && skb) {
rc = NETDEV_TX_BUSY;
break;
}
}
out:
*ret = rc;
return skb;
}
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
netdev_features_t features)
{
if (skb_vlan_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto))
skb = __vlan_hwaccel_push_inside(skb);
return skb;
}
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
netdev_features_t features;
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
goto out_null;
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
goto out_kfree_skb;
} else if (segs) {
consume_skb(skb);
skb = segs;
}
} else {
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
}
return skb;
out_kfree_skb:
kfree_skb(skb);
out_null:
atomic_long_inc(&dev->tx_dropped);
return NULL;
}
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
struct sk_buff *next, *head = NULL, *tail;
for (; skb != NULL; skb = next) {
next = skb->next;
skb->next = NULL;
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev);
if (!skb)
continue;
if (!head)
head = skb;
else
tail->next = skb;
/* If skb was segmented, skb->prev points to
* the last segment. If not, it still contains skb.
*/
tail = skb->prev;
}
return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
qdisc_skb_cb(skb)->pkt_len = skb->len;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
if (shinfo->gso_size) {
unsigned int hdr_len;
u16 gso_segs = shinfo->gso_segs;
/* mac layer + network layer */
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
hdr_len += tcp_hdrlen(skb);
else
hdr_len += sizeof(struct udphdr);
if (shinfo->gso_type & SKB_GSO_DODGY)
gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
shinfo->gso_size);
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
struct sk_buff *to_free = NULL;
bool contended;
int rc;
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
*/
contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
__qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
} else
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
}
}
spin_unlock(root_lock);
if (unlikely(to_free))
kfree_skb_list(to_free);
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
}
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
if (!skb->priority && skb->sk && map) {
unsigned int prioidx =
sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
if (prioidx < map->priomap_len)
skb->priority = map->priomap[prioidx];
}
}
#else
#define skb_update_prio(skb)
#endif
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
/**
* dev_loopback_xmit - loop back @skb
* @net: network namespace this loopback is happening in
* @sk: sk needed to be a netfilter okfn
* @skb: buffer to transmit
*/
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
skb->ip_summed = CHECKSUM_UNNECESSARY;
WARN_ON(!skb_dst(skb));
skb_dst_force(skb);
netif_rx_ni(skb);
return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
#ifdef CONFIG_NET_EGRESS
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
struct tcf_result cl_res;
if (!cl)
return skb;
/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
* earlier by the caller.
*/
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
*ret = NET_XMIT_DROP;
kfree_skb(skb);
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*ret = NET_XMIT_SUCCESS;
consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
default:
break;
}
return skb;
}
#endif /* CONFIG_NET_EGRESS */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
struct xps_map *map;
int queue_index = -1;
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
map = rcu_dereference(
dev_maps->cpu_map[skb->sender_cpu - 1]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
else
queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
map->len)];
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
}
rcu_read_unlock();
return queue_index;
#else
return -1;
#endif
}
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int new_index = get_xps_queue(dev, skb);
if (new_index < 0)
new_index = skb_tx_hash(dev, skb);
if (queue_index != new_index && sk &&
sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
queue_index = new_index;
}
return queue_index;
}
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
void *accel_priv)
{
int queue_index = 0;
#ifdef CONFIG_XPS
u32 sender_cpu = skb->sender_cpu - 1;
if (sender_cpu >= (u32)NR_CPUS)
skb->sender_cpu = raw_smp_processor_id() + 1;
#endif
if (dev->real_num_tx_queues != 1) {
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
__netdev_pick_tx);
else
queue_index = __netdev_pick_tx(dev, skb);
if (!accel_priv)
queue_index = netdev_cap_txqueue(dev, queue_index);
}
skb_set_queue_mapping(skb, queue_index);
return netdev_get_tx_queue(dev, queue_index);
}
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
* @accel_priv: private data used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
skb_reset_mac_header(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
skb_update_prio(skb);
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
# ifdef CONFIG_NET_EGRESS
if (static_key_false(&egress_needed)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
}
# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
else
skb_dst_force(skb);
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
if (unlikely(__this_cpu_read(xmit_recursion) >
XMIT_RECURSION_LIMIT))
goto recursion_alert;
skb = validate_xmit_skb(skb, dev);
if (!skb)
goto out;
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
/*=======================================================================
Receiver routines
=======================================================================*/
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64; /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
u32 flow_id;
u16 rxq_index;
int rc;
/* Should we steer this flow to a different hardware queue? */
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
!(dev->features & NETIF_F_NTUPLE))
goto out;
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
if (rxq_index == skb_get_rx_queue(skb))
goto out;
rxqueue = dev->_rx + rxq_index;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
rflow->filter = rc;
if (old_rflow->filter == rflow->filter)
old_rflow->filter = RPS_NO_FILTER;
out:
#endif
rflow->last_qtail =
per_cpu(softnet_data, next_cpu).input_queue_head;
}
rflow->cpu = next_cpu;
return rflow;
}
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
* rcu_read_lock must be held on entry.
*/
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
"of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
goto done;
}
rxqueue += index;
}
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
flow_table = rcu_dereference(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
if (!flow_table && !map)
goto done;
skb_reset_network_header(skb);
hash = skb_get_hash(skb);
if (!hash)
goto done;
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 ident;
/* First check into global flow table if there is a match */
ident = sock_flow_table->ents[hash & sock_flow_table->mask];
if ((ident ^ hash) & ~rps_cpu_mask)
goto try_rps;
next_cpu = ident & rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
* - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
* This guarantees that all previous packets for the flow
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
}
}
try_rps:
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
}
}
done:
return cpu;
}
#ifdef CONFIG_RFS_ACCEL
/**
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
* @dev: Device on which the filter was set
* @rxq_index: RX queue index
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
*
* Drivers that implement ndo_rx_flow_steer() should periodically call
* this function for each installed filter and remove the filters for
* which it returns %true.
*/
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = ACCESS_ONCE(rflow->cpu);
if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
expire = false;
}
rcu_read_unlock();
return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);
#endif /* CONFIG_RFS_ACCEL */
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
sd->received_rps++;
}
#endif /* CONFIG_RPS */
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
return 1;
}
#endif /* CONFIG_RPS */
return 0;
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
struct sd_flow_limit *fl;
struct softnet_data *sd;
unsigned int old_flow, new_flow;
if (qlen < (netdev_max_backlog >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
fl->history_head++;
fl->history_head &= FLOW_LIMIT_HISTORY - 1;
if (likely(fl->buckets[old_flow]))
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
fl->count++;
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
#endif
return false;
}
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &sd->backlog);
}
goto enqueue;
}
drop:
sd->dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id();
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
preempt_enable();
} else
#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
return ret;
}
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
trace_netif_rx_entry(skb);
return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
int netif_rx_ni(struct sk_buff *skb)
{
int err;
trace_netif_rx_ni_entry(skb);
preempt_disable();
err = netif_rx_internal(skb);
if (local_softirq_pending())
do_softirq();
preempt_enable();
return err;
}
EXPORT_SYMBOL(netif_rx_ni);
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
WARN_ON(atomic_read(&skb->users));
if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__kfree_skb_defer(skb);
}
__kfree_skb_flush();
}
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock;
head = head->next_sched;
root_lock = qdisc_lock(q);
spin_lock(root_lock);
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
spin_unlock(root_lock);
}
}
}
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
struct tcf_result cl_res;
/* If there's at least one ingress present somewhere (so
* we get here via enabled static key), remaining devices
* that are not configured with an ingress qdisc will bail
* out here.
*/
if (!cl)
return skb;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
qdisc_bstats_cpu_update(cl->q, skb);
switch (tc_classify(skb, cl, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
qdisc_qstats_cpu_drop(cl->q);
kfree_skb(skb);
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
* we can safely push the L2 header back before
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
default:
break;
}
#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
/**
* netdev_is_rx_handler_busy - check if receive handler is registered
* @dev: device to check
*
* Check if a receive handler is already registered for a given device.
* Return true if there one.
*
* The caller must hold the rtnl_mutex.
*/
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
ASSERT_RTNL();
return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
* Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
* The caller must hold the rtnl_mutex.
*
* For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
ASSERT_RTNL();
if (dev->rx_handler)
return -EBUSY;
/* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
/**
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
* Unregister a receive handler from a device.
*
* The caller must hold the rtnl_mutex.
*/
void netdev_rx_handler_unregister(struct net_device *dev)
{
ASSERT_RTNL();
RCU_INIT_POINTER(dev->rx_handler, NULL);
/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
* section has a guarantee to see a non NULL rx_handler_data
* as well.
*/
synchronize_net();
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/*
* Limit the use of PFMEMALLOC reserves to those protocols that implement
* the special handling of PFMEMALLOC skbs.
*/
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_ARP):
case htons(ETH_P_IP):
case htons(ETH_P_IPV6):
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
return true;
default:
return false;
}
}
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
rcu_read_lock();
ingress_retval = nf_hook_ingress(skb);
rcu_read_unlock();
return ingress_retval;
}
#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
}
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
if (pfmemalloc)
goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
ncls:
#endif
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (skb_vlan_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb))) {
if (skb_vlan_tag_get_id(skb))
skb->pkt_type = PACKET_OTHERHOST;
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
skb->vlan_tci = 0;
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
if (pt_prev) {
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
if (!deliver_exact)
atomic_long_inc(&skb->dev->rx_dropped);
else
atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
return ret;
}
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
unsigned long pflags = current->flags;
/*
* PFMEMALLOC skbs are special, they should
* - be delivered to SOCK_MEMALLOC sockets only
* - stay away from userspace
* - have bounded memory usage
*
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
current->flags |= PF_MEMALLOC;
ret = __netif_receive_skb_core(skb, true);
tsk_restore_flags(current, pflags, PF_MEMALLOC);
} else
ret = __netif_receive_skb_core(skb, false);
return ret;
}
static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
rcu_read_lock();
#ifdef CONFIG_RPS
if (static_key_false(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return ret;
}
}
#endif
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
}
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
trace_netif_receive_skb_entry(skb);
return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
struct softnet_data *sd;
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
local_irq_disable();
rps_lock(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
rps_unlock(sd);
local_irq_enable();
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
local_bh_enable();
}
static void flush_all_backlogs(void)
{
unsigned int cpu;
get_online_cpus();
for_each_online_cpu(cpu)
queue_work_on(cpu, system_highpri_wq,
per_cpu_ptr(&flush_works, cpu));
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(&flush_works, cpu));
put_online_cpus();
}
static int napi_gro_complete(struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
int err = -ENOENT;
BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
if (NAPI_GRO_CB(skb)->count == 1) {
skb_shinfo(skb)->gso_size = 0;
goto out;
}
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
err = ptype->callbacks.gro_complete(skb, 0);
break;
}
rcu_read_unlock();
if (err) {
WARN_ON(&ptype->list == head);
kfree_skb(skb);
return NET_RX_SUCCESS;
}
out:
return netif_receive_skb_internal(skb);
}
/* napi->gro_list contains packets ordered by age.
* youngest packets at the head of it.
* Complete skbs in reverse order to reduce latencies.
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
struct sk_buff *skb, *prev = NULL;
/* scan list and build reverse chain */
for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
skb->prev = prev;
prev = skb;
}
for (skb = prev; skb; skb = prev) {
skb->next = NULL;
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
prev = skb->prev;
napi_gro_complete(skb);
napi->gro_count--;
}
napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff *p;
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
for (p = napi->gro_list; p; p = p->next) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
diffs |= skb_metadata_dst_cmp(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
skb_mac_header(skb),
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
}
static void skb_gro_reset_offset(struct sk_buff *skb)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
NAPI_GRO_CB(skb)->data_offset = 0;
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
skb->end - skb->tail);
}
}
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
struct skb_shared_info *pinfo = skb_shinfo(skb);
BUG_ON(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
skb->data_len -= grow;
skb->tail += grow;
pinfo->frags[0].page_offset += grow;
skb_frag_size_sub(&pinfo->frags[0], grow);
if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
skb_frag_unref(skb, 0);
memmove(pinfo->frags, pinfo->frags + 1,
--pinfo->nr_frags * sizeof(pinfo->frags[0]));
}
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct sk_buff **pp = NULL;
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
int same_flow;
enum gro_result ret;
int grow;
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
goto normal;
gro_list_prepare(napi, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_receive)
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->recursion_counter = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
NAPI_GRO_CB(skb)->csum = skb->csum;
NAPI_GRO_CB(skb)->csum_valid = 1;
NAPI_GRO_CB(skb)->csum_cnt = 0;
break;
case CHECKSUM_UNNECESSARY:
NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
NAPI_GRO_CB(skb)->csum_valid = 0;
break;
default:
NAPI_GRO_CB(skb)->csum_cnt = 0;
NAPI_GRO_CB(skb)->csum_valid = 0;
}
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
}
rcu_read_unlock();
if (&ptype->list == head)
goto normal;
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
struct sk_buff *nskb = *pp;
*pp = nskb->next;
nskb->next = NULL;
napi_gro_complete(nskb);
napi->gro_count--;
}
if (same_flow)
goto ok;
if (NAPI_GRO_CB(skb)->flush)
goto normal;
if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
struct sk_buff *nskb = napi->gro_list;
/* locate the end of the list to select the 'oldest' flow */
while (nskb->next) {
pp = &nskb->next;
nskb = *pp;
}
*pp = NULL;
nskb->next = NULL;
napi_gro_complete(nskb);
} else {
napi->gro_count++;
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
skb->next = napi->gro_list;
napi->gro_list = skb;
ret = GRO_HELD;
pull:
grow = skb_gro_offset(skb) - skb_headlen(skb);
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
return ret;
normal:
ret = GRO_NORMAL;
goto pull;
}
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
struct list_head *offload_head = &offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
if (ptype->type != type || !ptype->callbacks.gro_receive)
continue;
return ptype;
}
return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);
struct packet_offload *gro_find_complete_by_type(__be16 type)
{
struct list_head *offload_head = &offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
return ptype;
}
return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
switch (ret) {
case GRO_NORMAL:
if (netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
case GRO_DROP:
kfree_skb(skb);
break;
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
skb_dst_drop(skb);
kmem_cache_free(skbuff_head_cache, skb);
} else {
__kfree_skb(skb);
}
break;
case GRO_HELD:
case GRO_MERGED:
break;
}
return ret;
}
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb);
return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
if (unlikely(skb->pfmemalloc)) {
consume_skb(skb);
return;
}
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
skb->vlan_tci = 0;
skb->dev = napi->dev;
skb->skb_iif = 0;
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
napi->skb = skb;
}
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
if (!skb) {
skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
if (skb) {
napi->skb = skb;
skb_mark_napi_id(skb, napi);
}
}
return skb;
}
EXPORT_SYMBOL(napi_get_frags);
static gro_result_t napi_frags_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
case GRO_HELD:
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
ret = GRO_DROP;
break;
case GRO_DROP:
case GRO_MERGED_FREE:
napi_reuse_skb(napi, skb);
break;
case GRO_MERGED:
break;
}
return ret;
}
/* Upper GRO stack assumes network header starts at gro_offset=0
* Drivers could call both napi_gro_frags() and napi_gro_receive()
* We copy ethernet header into skb->data to have a common layout.
*/
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
const struct ethhdr *eth;
unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb);
eth = skb_gro_header_fast(skb, 0);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
net_warn_ratelimited("%s: dropping impossible skb from %s\n",
__func__, napi->dev->name);
napi_reuse_skb(napi, skb);
return NULL;
}
} else {
gro_pull_from_frag0(skb, hlen);
NAPI_GRO_CB(skb)->frag0 += hlen;
NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
__skb_pull(skb, hlen);
/*
* This works because the only protocols we care about don't require
* special handling.
* We'll fix it up properly in napi_frags_finish()
*/
skb->protocol = eth->h_proto;
return skb;
}
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
return GRO_DROP;
trace_napi_gro_frags_entry(skb);
return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
/* Compute the checksum from gro_offset and return the folded value
* after adding in any pseudo checksum.
*/
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
__wsum wsum;
__sum16 sum;
wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
}
NAPI_GRO_CB(skb)->csum = wsum;
NAPI_GRO_CB(skb)->csum_valid = 1;
return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
*/
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
if (remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
while (remsd) {
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
smp_call_function_single_async(remsd->cpu,
&remsd->csd);
remsd = next;
}
} else
#endif
local_irq_enable();
}
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
return sd->rps_ipi_list != NULL;
#else
return false;
#endif
}
static int process_backlog(struct napi_struct *napi, int quota)
{
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
bool again = true;
int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
napi->weight = weight_p;
while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
input_queue_head_incr(sd);
if (++work >= quota)
goto state_changed;
}
local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set
* on backlog.
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
again = false;
} else {
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
rps_unlock(sd);
local_irq_enable();
}
state_changed:
napi_gro_flush(napi, false);
sd->current_napi = NULL;
return work;
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run.
* Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
* Variant of __napi_schedule() assuming hard irqs are masked
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
void __napi_complete(struct napi_struct *n)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
list_del_init(&n->poll_list);
smp_mb__before_atomic();
sd->current_napi = NULL;
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
void napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags;
/*
* don't let napi dequeue from the cpu poll list
* just in case its running on a different cpu
*/
if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
return;
if (n->gro_list) {
unsigned long timeout = 0;
if (work_done)
timeout = n->dev->gro_flush_timeout;
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
else
napi_gro_flush(n, false);
}
if (likely(list_empty(&n->poll_list))) {
WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
} else {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
__napi_complete(n);
local_irq_restore(flags);
}
}
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
if (napi->napi_id == napi_id)
return napi;
return NULL;
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
int (*busy_poll)(struct napi_struct *dev);
struct napi_struct *napi;
int rc = false;
rcu_read_lock();
napi = napi_by_id(sk->sk_napi_id);
if (!napi)
goto out;
/* Note: ndo_busy_poll method is optional in linux-4.5 */
busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
do {
rc = 0;
local_bh_disable();
if (busy_poll) {
rc = busy_poll(napi);
} else if (napi_schedule_prep(napi)) {
void *have = netpoll_poll_lock(napi);
if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
rc = napi->poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
if (rc == BUSY_POLL_BUDGET) {
napi_complete_done(napi, rc);
napi_schedule(napi);
}
}
netpoll_poll_unlock(have);
}
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
local_bh_enable();
if (rc == LL_FLUSH_FAILED)
break; /* permanent failure */
cpu_relax();
} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
!need_resched() && !busy_loop_timeout(end_time));
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
return rc;
}
EXPORT_SYMBOL(sk_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
return;
spin_lock(&napi_hash_lock);
/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
do {
if (unlikely(++napi_gen_id < NR_CPUS + 1))
napi_gen_id = NR_CPUS + 1;
} while (napi_by_id(napi_gen_id));
napi->napi_id = napi_gen_id;
hlist_add_head_rcu(&napi->napi_hash_node,
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
bool napi_hash_del(struct napi_struct *napi)
{
bool rcu_sync_needed = false;
spin_lock(&napi_hash_lock);
if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
rcu_sync_needed = true;
hlist_del_rcu(&napi->napi_hash_node);
}
spin_unlock(&napi_hash_lock);
return rcu_sync_needed;
}
EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
struct napi_struct *napi;
napi = container_of(timer, struct napi_struct, timer);
if (napi->gro_list)
napi_schedule(napi);
return HRTIMER_NORESTART;
}
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
napi->gro_count = 0;
napi->gro_list = NULL;
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
pr_err_once("netif_napi_add() called with weight %d on device %s\n",
weight, dev->name);
napi->weight = weight;
list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
void napi_disable(struct napi_struct *n)
{
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
msleep(1);
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
might_sleep();
if (napi_hash_del(napi))
synchronize_net();
list_del_init(&napi->dev_list);
napi_free_frags(napi);
kfree_skb_list(napi->gro_list);
napi->gro_list = NULL;
napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
struct napi_struct *get_current_napi_context(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
return sd->current_napi;
}
EXPORT_SYMBOL(get_current_napi_context);
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
void *have;
int work, weight;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
sd->current_napi = n;
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
}
WARN_ON_ONCE(work > weight);
if (likely(work < weight))
goto out_unlock;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
goto out_unlock;
}
if (n->gro_list) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(n, HZ >= 1000);
}
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
goto out_unlock;
}
list_add_tail(&n->poll_list, repoll);
out_unlock:
netpoll_poll_unlock(have);
return work;
}
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) {
struct napi_struct *n;
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
return;
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
__kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
struct net_device *dev;
/* upper master flag, there can only be one master device per list */
bool master;
/* counter for the number of times this device was added to us */
u16 ref_nr;
/* private field for the users */
void *private;
struct list_head list;
struct rcu_head rcu;
};
static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
struct list_head *adj_list)
{
struct netdev_adjacent *adj;
list_for_each_entry(adj, adj_list, list) {
if (adj->dev == adj_dev)
return adj;
}
return NULL;
}
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
* Find out if a device is linked to specified upper device and return true
* in case it is. Note that this checks only immediate upper device,
* not through a complete stack of devices. The caller must hold the RTNL lock.
*/
bool netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
ASSERT_RTNL();
return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
* Find out if a device is linked to an upper device and return true in case
* it is. The caller must hold the RTNL lock.
*/
static bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
return !list_empty(&dev->all_adj_list.upper);
}
/**
* netdev_master_upper_dev_get - Get master upper device
* @dev: device
*
* Find a master upper device and return pointer to it or NULL in case
* it's not there. The caller must hold the RTNL lock.
*/
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
struct netdev_adjacent *upper;
ASSERT_RTNL();
if (list_empty(&dev->adj_list.upper))
return NULL;
upper = list_first_entry(&dev->adj_list.upper,
struct netdev_adjacent, list);
if (likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
adj = list_entry(adj_list, struct netdev_adjacent, list);
return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next device from the dev's upper list, starting from iter
* position. The caller must hold RCU read lock.
*/
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *upper;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
/**
* netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next device from the dev's upper list, starting from iter
* position. The caller must hold RCU read lock.
*/
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *upper;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&upper->list == &dev->all_adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
/**
* netdev_lower_get_next_private - Get the next ->private from the
* lower neighbour list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold either hold the
* RTNL lock or its own locking that guarantees that the neighbour lower
* list will remain unchanged.
*/
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = lower->list.next;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);
/**
* netdev_lower_get_next_private_rcu - Get the next ->private from the
* lower neighbour list, RCU
* variant
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold RCU read lock.
*/
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
WARN_ON_ONCE(!rcu_read_lock_held());
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
/**
* netdev_lower_get_next - Get the next device from the lower neighbour
* list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent from the dev's lower neighbour
* list, starting from iter position. The caller must hold RTNL lock or
* its own locking that guarantees that the neighbour lower
* list will remain unchanged.
*/
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = lower->list.next;
return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);
/**
* netdev_all_lower_get_next - Get the next device from all lower neighbour list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent from the dev's all lower neighbour
* list, starting from iter position. The caller must hold RTNL lock or
* its own locking that guarantees that the neighbour all lower
* list will remain unchanged.
*/
struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->all_adj_list.lower)
return NULL;
*iter = lower->list.next;
return lower->dev;
}
EXPORT_SYMBOL(netdev_all_lower_get_next);
/**
* netdev_all_lower_get_next_rcu - Get the next device from all
* lower neighbour list, RCU variant
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent from the dev's all lower neighbour
* list, starting from iter position. The caller must hold RCU read lock.
*/
struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->all_adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->dev;
}
EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
* @dev: device
*
* Gets the first netdev_adjacent->private from the dev's lower neighbour
* list. The caller must hold RCU read lock.
*/
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
struct netdev_adjacent *lower;
lower = list_first_or_null_rcu(&dev->adj_list.lower,
struct netdev_adjacent, list);
if (lower)
return lower->private;
return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
/**
* netdev_master_upper_dev_get_rcu - Get master upper device
* @dev: device
*
* Find a master upper device and return pointer to it or NULL in case
* it's not there. The caller must hold the RCU read lock.
*/
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
struct netdev_adjacent *upper;
upper = list_first_or_null_rcu(&dev->adj_list.upper,
struct netdev_adjacent, list);
if (upper && likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
static int netdev_adjacent_sysfs_add(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", adj_dev->name);
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
linkname);
}
static void netdev_adjacent_sysfs_del(struct net_device *dev,
char *name,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", name);
sysfs_remove_link(&(dev->dev.kobj), linkname);
}
static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list)
{
return (dev_list == &dev->adj_list.upper ||
dev_list == &dev->adj_list.lower) &&
net_eq(dev_net(dev), dev_net(adj_dev));
}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
u16 ref_nr,
struct list_head *dev_list,
void *private, bool master)
{
struct netdev_adjacent *adj;
int ret;
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
adj->ref_nr += ref_nr;
return 0;
}
adj = kmalloc(sizeof(*adj), GFP_KERNEL);
if (!adj)
return -ENOMEM;
adj->dev = adj_dev;
adj->master = master;
adj->ref_nr = ref_nr;
adj->private = private;
dev_hold(adj_dev);
pr_debug("dev_hold for %s, because of link added from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
if (ret)
goto free_adj;
}
/* Ensure that master link is always the first item in list. */
if (master) {
ret = sysfs_create_link(&(dev->dev.kobj),
&(adj_dev->dev.kobj), "master");
if (ret)
goto remove_symlinks;
list_add_rcu(&adj->list, dev_list);
} else {
list_add_tail_rcu(&adj->list, dev_list);
}
return 0;
remove_symlinks:
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
kfree(adj);
dev_put(adj_dev);
return ret;
}
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
pr_err("tried to remove device %s from %s\n",
dev->name, adj_dev->name);
BUG();
}
if (adj->ref_nr > ref_nr) {
pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
ref_nr, adj->ref_nr-ref_nr);
adj->ref_nr -= ref_nr;
return;
}
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
pr_debug("dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
private, master);
if (ret)
return ret;
ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
private, false);
if (ret) {
__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
return ret;
}
return 0;
}
static int __netdev_adjacent_dev_link(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr)
{
return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower,
NULL, false);
}
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static void __netdev_adjacent_dev_unlink(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr)
{
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
&dev->all_adj_list.upper,
&upper_dev->all_adj_list.lower);
}
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
if (ret)
return ret;
ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower,
private, master);
if (ret) {
__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
return ret;
}
return 0;
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j, *to_i, *to_j;
int ret = 0;
ASSERT_RTNL();
if (dev == upper_dev)
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
return -EBUSY;
if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
return -EEXIST;
if (master && netdev_master_upper_dev_get(dev))
return -EBUSY;
changeupper_info.upper_dev = upper_dev;
changeupper_info.master = master;
changeupper_info.linking = true;
changeupper_info.upper_info = upper_info;
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
return ret;
ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
master);
if (ret)
return ret;
/* Now that we linked these devs, make all the upper_dev's
* all_adj_list.upper visible to every dev's all_adj_list.lower an
* versa, and don't forget the devices itself. All of these
* links are non-neighbours.
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
pr_debug("Interlinking %s with %s, non-neighbour\n",
i->dev->name, j->dev->name);
ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
if (ret)
goto rollback_mesh;
}
}
/* add dev to every upper_dev's upper device */
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
pr_debug("linking %s's upper device %s with %s\n",
upper_dev->name, i->dev->name, dev->name);
ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
if (ret)
goto rollback_upper_mesh;
}
/* add upper_dev to every dev's lower device */
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
pr_debug("linking %s's lower device %s with %s\n", dev->name,
i->dev->name, upper_dev->name);
ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
if (ret)
goto rollback_lower_mesh;
}
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
goto rollback_lower_mesh;
return 0;
rollback_lower_mesh:
to_i = i;
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
if (i == to_i)
break;
__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
}
i = NULL;
rollback_upper_mesh:
to_i = i;
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
if (i == to_i)
break;
__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
}
i = j = NULL;
rollback_mesh:
to_i = i;
to_j = j;
list_for_each_entry(i, &dev->all_adj_list.lower, list) {
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
if (i == to_i && j == to_j)
break;
__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
}
if (i == to_i)
break;
}
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
}
/**
* netdev_upper_dev_link - Add a link to the upper device
* @dev: device
* @upper_dev: new upper device
*
* Adds a link to device which is upper to this one. The caller must hold
* the RTNL lock. On a failure a negative errno code is returned.
* On success the reference counts are adjusted and the function
* returns zero.
*/
int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
/**
* netdev_master_upper_dev_link - Add a master link to the upper device
* @dev: device
* @upper_dev: new upper device
* @upper_priv: upper device private
* @upper_info: upper info to be passed down via notifier
*
* Adds a link to device which is upper to this one. In this case, only
* one master upper device can be linked, although other non-master devices
* might be linked as well. The caller must hold the RTNL lock.
* On a failure a negative errno code is returned. On success the reference
* counts are adjusted and the function returns zero.
*/
int netdev_master_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
void *upper_priv, void *upper_info)
{
return __netdev_upper_dev_link(dev, upper_dev, true,
upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
/**
* netdev_upper_dev_unlink - Removes a link to upper device
* @dev: device
* @upper_dev: new upper device
*
* Removes a link to device which is upper to this one. The caller must hold
* the RTNL lock.
*/
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j;
ASSERT_RTNL();
changeupper_info.upper_dev = upper_dev;
changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
changeupper_info.linking = false;
call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
&changeupper_info.info);
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
/* Here is the tricky part. We must remove all dev's lower
* devices from all upper_dev's upper devices and vice
* versa, to maintain the graph relationship.
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
/* remove also the devices itself from lower/upper device
* list
*/
list_for_each_entry(i, &dev->all_adj_list.lower, list)
__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
/**
* netdev_bonding_info_change - Dispatch event about slave change
* @dev: device
* @bonding_info: info to dispatch
*
* Send NETDEV_BONDING_INFO to netdev notifiers with info.
* The caller must hold the RTNL lock.
*/
void netdev_bonding_info_change(struct net_device *dev,
struct netdev_bonding_info *bonding_info)
{
struct netdev_notifier_bonding_info info;
memcpy(&info.bonding_info, bonding_info,
sizeof(struct netdev_bonding_info));
call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
&info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(dev, iter->dev,
&dev->adj_list.upper);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(dev, iter->dev,
&dev->adj_list.lower);
}
}
static void netdev_adjacent_del_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_del(dev, iter->dev->name,
&dev->adj_list.upper);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_del(dev, iter->dev->name,
&dev->adj_list.lower);
}
}
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
}
}
void *netdev_lower_dev_get_private(struct net_device *dev,
struct net_device *lower_dev)
{
struct netdev_adjacent *lower;
if (!lower_dev)
return NULL;
lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
if (!lower)
return NULL;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);
int dev_get_nest_level(struct net_device *dev)
{
struct net_device *lower = NULL;
struct list_head *iter;
int max_nest = -1;
int nest;
ASSERT_RTNL();
netdev_for_each_lower_dev(dev, lower, iter) {
nest = dev_get_nest_level(lower);
if (max_nest < nest)
max_nest = nest;
}
return max_nest + 1;
}
EXPORT_SYMBOL(dev_get_nest_level);
/**
* netdev_lower_change - Dispatch event about lower device state change
* @lower_dev: device
* @lower_state_info: state to dispatch
*
* Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
* The caller must hold the RTNL lock.
*/
void netdev_lower_state_changed(struct net_device *lower_dev,
void *lower_state_info)
{
struct netdev_notifier_changelowerstate_info changelowerstate_info;
ASSERT_RTNL();
changelowerstate_info.lower_state_info = lower_state_info;
call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
&changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);
int netdev_default_l2upper_neigh_construct(struct net_device *dev,
struct neighbour *n)
{
struct net_device *lower_dev, *stop_dev;
struct list_head *iter;
int err;
netdev_for_each_lower_dev(dev, lower_dev, iter) {
if (!lower_dev->netdev_ops->ndo_neigh_construct)
continue;
err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
if (err) {
stop_dev = lower_dev;
goto rollback;
}
}
return 0;
rollback:
netdev_for_each_lower_dev(dev, lower_dev, iter) {
if (lower_dev == stop_dev)
break;
if (!lower_dev->netdev_ops->ndo_neigh_destroy)
continue;
lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
}
return err;
}
EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
struct neighbour *n)
{
struct net_device *lower_dev;
struct list_head *iter;
netdev_for_each_lower_dev(dev, lower_dev, iter) {
if (!lower_dev->netdev_ops->ndo_neigh_destroy)
continue;
lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
}
}
EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_rx_flags)
ops->ndo_change_rx_flags(dev, flags);
}
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
kuid_t uid;
kgid_t gid;
ASSERT_RTNL();
dev->flags |= IFF_PROMISC;
dev->promiscuity += inc;
if (dev->promiscuity == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch promisc and return error.
*/
if (inc < 0)
dev->flags &= ~IFF_PROMISC;
else {
dev->promiscuity -= inc;
pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
dev->name);
return -EOVERFLOW;
}
}
if (dev->flags != old_flags) {
pr_info("device %s %s promiscuous mode\n",
dev->name,
dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
current_uid_gid(&uid, &gid);
audit_log(current->audit_context, GFP_ATOMIC,
AUDIT_ANOM_PROMISCUOUS,
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
dev->name, (dev->flags & IFF_PROMISC),
(old_flags & IFF_PROMISC),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, uid),
from_kgid(&init_user_ns, gid),
audit_get_sessionid(current));
}
dev_change_rx_flags(dev, IFF_PROMISC);
}
if (notify)
__dev_notify_flags(dev, old_flags, IFF_PROMISC);
return 0;
}
/**
* dev_set_promiscuity - update promiscuity count on a device
* @dev: device
* @inc: modifier
*
* Add or remove promiscuity from a device. While the count in the device
* remains above zero the interface remains promiscuous. Once it hits zero
* the device reverts back to normal filtering operation. A negative inc
* value is used to drop promiscuity on the device.
* Return 0 if successful or a negative errno code on error.
*/
int dev_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
err = __dev_set_promiscuity(dev, inc, true);
if (err < 0)
return err;
if (dev->flags != old_flags)
dev_set_rx_mode(dev);
return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
ASSERT_RTNL();
dev->flags |= IFF_ALLMULTI;
dev->allmulti += inc;
if (dev->allmulti == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch allmulti and return error.
*/
if (inc < 0)
dev->flags &= ~IFF_ALLMULTI;
else {
dev->allmulti -= inc;
pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
dev->name);
return -EOVERFLOW;
}
}
if (dev->flags ^ old_flags) {
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
if (notify)
__dev_notify_flags(dev, old_flags,
dev->gflags ^ old_gflags);
}
return 0;
}
/**
* dev_set_allmulti - update allmulti count on a device
* @dev: device
* @inc: modifier
*
* Add or remove reception of all multicast frames to a device. While the
* count in the device remains above zero the interface remains listening
* to all interfaces. Once it hits zero the device reverts back to normal
* filtering operation. A negative @inc value is used to drop the counter
* when releasing a resource needing all multicasts.
* Return 0 if successful or a negative errno code on error.
*/
int dev_set_allmulti(struct net_device *dev, int inc)
{
return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
* filtering it is put in promiscuous mode while unicast addresses
* are present.
*/
void __dev_set_rx_mode(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
/* dev_open will call this function so the list will stay sane. */
if (!(dev->flags&IFF_UP))
return;
if (!netif_device_present(dev))
return;
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
__dev_set_promiscuity(dev, 1, false);
dev->uc_promisc = true;
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
__dev_set_promiscuity(dev, -1, false);
dev->uc_promisc = false;
}
}
if (ops->ndo_set_rx_mode)
ops->ndo_set_rx_mode(dev);
}
void dev_set_rx_mode(struct net_device *dev)
{
netif_addr_lock_bh(dev);
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
}
/**
* dev_get_flags - get flags reported to userspace
* @dev: device
*
* Get the combination of flag bits exported through APIs to userspace.
*/
unsigned int dev_get_flags(const struct net_device *dev)
{
unsigned int flags;
flags = (dev->flags & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
(dev->gflags & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
if (netif_oper_up(dev))
flags |= IFF_RUNNING;
if (netif_carrier_ok(dev))
flags |= IFF_LOWER_UP;
if (netif_dormant(dev))
flags |= IFF_DORMANT;
}
return flags;
}
EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
/*
* Set the flags on our device.
*/
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
IFF_AUTOMEDIA)) |
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
IFF_ALLMULTI));
/*
* Load in the correct multicast list now the flags have changed.
*/
if ((old_flags ^ flags) & IFF_MULTICAST)
dev_change_rx_flags(dev, IFF_MULTICAST);
dev_set_rx_mode(dev);
/*
* Have we downed the interface. We handle IFF_UP ourselves
* according to user attempts to set it, rather than blindly
* setting it.
*/
ret = 0;
if ((old_flags ^ flags) & IFF_UP)
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
unsigned int old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
if (__dev_set_promiscuity(dev, inc, false) >= 0)
if (dev->flags != old_flags)
dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
is important. Some (broken) drivers set IFF_PROMISC, when
IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
__dev_set_allmulti(dev, inc, false);
}
return ret;
}
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges)
{
unsigned int changes = dev->flags ^ old_flags;
if (gchanges)
rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
call_netdevice_notifiers(NETDEV_UP, dev);
else
call_netdevice_notifiers(NETDEV_DOWN, dev);
}
if (dev->flags & IFF_UP &&
(changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
struct netdev_notifier_change_info change_info;
change_info.flags_changed = changes;
call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
&change_info.info);
}
}
/**
* dev_change_flags - change device settings
* @dev: device
* @flags: device state flags
*
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
ret = __dev_change_flags(dev, flags);
if (ret < 0)
return ret;
changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
__dev_notify_flags(dev, old_flags, changes);
return ret;
}
EXPORT_SYMBOL(dev_change_flags);
static int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
dev->mtu = new_mtu;
return 0;
}
/**
* dev_set_mtu - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
*
* Change the maximum transfer size of the network device.
*/
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
int err, orig_mtu;
if (new_mtu == dev->mtu)
return 0;
/* MTU must be positive. */
if (new_mtu < 0)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
err = notifier_to_errno(err);
if (err)
return err;
orig_mtu = dev->mtu;
err = __dev_set_mtu(dev, new_mtu);
if (!err) {
err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
err = notifier_to_errno(err);
if (err) {
/* setting mtu back and notifying everyone again,
* so that they have a chance to revert changes.
*/
__dev_set_mtu(dev, orig_mtu);
call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
}
}
return err;
}
EXPORT_SYMBOL(dev_set_mtu);
/**
* dev_set_group - Change group this device belongs to
* @dev: device
* @new_group: group this device should belong to
*/
void dev_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
*
* Change the hardware (MAC) address of the device
*/
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
if (sa->sa_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
err = ops->ndo_set_mac_address(dev, sa);
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
/**
* dev_change_carrier - Change device carrier
* @dev: device
* @new_carrier: new value
*
* Change device carrier
*/
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_change_carrier)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);
/**
* dev_get_phys_port_id - Get device physical port ID
* @dev: device
* @ppid: port ID
*
* Get device physical port ID
*/
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_get_phys_port_id)
return -EOPNOTSUPP;
return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);
/**
* dev_get_phys_port_name - Get device physical port name
* @dev: device
* @name: port name
* @len: limit of bytes to copy to name
*
* Get device physical port name
*/
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_get_phys_port_name)
return -EOPNOTSUPP;
return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);
/**
* dev_change_proto_down - update protocol port state information
* @dev: device
* @proto_down: new value
*
* This info can be used by switch drivers to set the phys state of the
* port.
*/
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @fd: new program fd or negative value to clear
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, int fd)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
struct netdev_xdp xdp = {};
int err;
if (!ops->ndo_xdp)
return -EOPNOTSUPP;
if (fd >= 0) {
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
err = ops->ndo_xdp(dev, &xdp);
if (err < 0 && prog)
bpf_prog_put(prog);
return err;
}
EXPORT_SYMBOL(dev_change_xdp_fd);
/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
* Returns a suitable unique value for a new device interface
* number. The caller must hold the rtnl semaphore or the
* dev_base_lock to be sure it remains unique.
*/
static int dev_new_index(struct net *net)
{
int ifindex = net->ifindex;
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
if (!__dev_get_by_index(net, ifindex))
return net->ifindex = ifindex;
}
}
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
struct net_device *dev, *tmp;
LIST_HEAD(close_head);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
/* Some devices call without registering
* for initialization unwind. Remove those
* devices and proceed with the remaining.
*/
if (dev->reg_state == NETREG_UNINITIALIZED) {
pr_debug("unregister_netdevice: device %s/%p never was registered\n",
dev->name, dev);
WARN_ON(1);
list_del(&dev->unreg_list);
continue;
}
dev->dismantle = true;
BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
/* If device is running, close it first. */
list_for_each_entry(dev, head, unreg_list)
list_add_tail(&dev->close_list, &close_head);
dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
}
flush_all_backlogs();
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
GFP_KERNEL);
/*
* Flush the unicast and multicast chains
*/
dev_uc_flush(dev);
dev_mc_flush(dev);
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (skb)
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
/* Remove XPS queueing entries */
netif_reset_xps_queues_gt(dev, 0);
#endif
}
synchronize_net();
list_for_each_entry(dev, head, unreg_list)
dev_put(dev);
}
static void rollback_registered(struct net_device *dev)
{
LIST_HEAD(single);
list_add(&dev->unreg_list, &single);
rollback_registered_many(&single);
list_del(&single);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
struct net_device *upper, netdev_features_t features)
{
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
netdev_features_t feature;
int feature_bit;
for_each_netdev_feature(&upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(upper->wanted_features & feature)
&& (features & feature)) {
netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
&feature, upper->name);
features &= ~feature;
}
}
return features;
}
static void netdev_sync_lower_features(struct net_device *upper,
struct net_device *lower, netdev_features_t features)
{
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
netdev_features_t feature;
int feature_bit;
for_each_netdev_feature(&upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
lower->wanted_features &= ~feature;
netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
}
}
}
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
/* Fix illegal checksum combinations */
if ((features & NETIF_F_HW_CSUM) &&
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
netdev_warn(dev, "mixed HW and IP checksum settings.\n");
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
features &= ~NETIF_F_ALL_TSO;
}
if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
!(features & NETIF_F_IP_CSUM)) {
netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
features &= ~NETIF_F_TSO;
features &= ~NETIF_F_TSO_ECN;
}
if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
!(features & NETIF_F_IPV6_CSUM)) {
netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
features &= ~NETIF_F_TSO6;
}
/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
features &= ~NETIF_F_TSO_MANGLEID;
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
/* Software GSO depends on SG. */
if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
features &= ~NETIF_F_GSO;
}
/* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
if (!(features & NETIF_F_HW_CSUM) &&
((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
netdev_dbg(dev,
"Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
}
if (!(features & NETIF_F_SG)) {
netdev_dbg(dev,
"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
features &= ~NETIF_F_UFO;
}
}
/* GSO partial features require GSO partial be set */
if ((features & dev->gso_partial_features) &&
!(features & NETIF_F_GSO_PARTIAL)) {
netdev_dbg(dev,
"Dropping partially supported GSO features since no GSO partial.\n");
features &= ~dev->gso_partial_features;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
if (dev->netdev_ops->ndo_busy_poll)
features |= NETIF_F_BUSY_POLL;
else
#endif
features &= ~NETIF_F_BUSY_POLL;
return features;
}
int __netdev_update_features(struct net_device *dev)
{
struct net_device *upper, *lower;
netdev_features_t features;
struct list_head *iter;
int err = -1;
ASSERT_RTNL();
features = netdev_get_wanted_features(dev);
if (dev->netdev_ops->ndo_fix_features)
features = dev->netdev_ops->ndo_fix_features(dev, features);
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
/* some features can't be enabled if they're off an an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
if (dev->features == features)
goto sync_lower;
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
&dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
else
err = 0;
if (unlikely(err < 0)) {
netdev_err(dev,
"set_features() failed (%d); wanted %pNF, left %pNF\n",
err, &features, &dev->features);
/* return non-0 since some features might have changed and
* it's better to fire a spurious notification than miss it
*/
return -1;
}
sync_lower:
/* some features must be disabled on lower devices when disabled
* on an upper device (think: bonding master or bridge)
*/
netdev_for_each_lower_dev(dev, lower, iter)
netdev_sync_lower_features(dev, lower, features);
if (!err)
dev->features = features;
return err < 0 ? 0 : 1;
}
/**
* netdev_update_features - recalculate device features
* @dev: the device to check
*
* Recalculate dev->features set and send notifications if it
* has changed. Should be called after driver or hardware dependent
* conditions might have changed that influence the features.
*/
void netdev_update_features(struct net_device *dev)
{
if (__netdev_update_features(dev))
netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
/**
* netdev_change_features - recalculate device features
* @dev: the device to check
*
* Recalculate dev->features set and send notifications even
* if they have not changed. Should be called instead of
* netdev_update_features() if also dev->vlan_features might
* have changed to allow the changes to be propagated to stacked
* VLAN devices.
*/
void netdev_change_features(struct net_device *dev)
{
__netdev_update_features(dev);
netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
/**
* netif_stacked_transfer_operstate - transfer operstate
* @rootdev: the root or lower level device to transfer state from
* @dev: the device to transfer operstate to
*
* Transfer operational state from root to device. This is normally
* called when a stacking relationship exists between the root
* device and the device(a leaf device).
*/
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
struct net_device *dev)
{
if (rootdev->operstate == IF_OPER_DORMANT)
netif_dormant_on(dev);
else
netif_dormant_off(dev);
if (netif_carrier_ok(rootdev)) {
if (!netif_carrier_ok(dev))
netif_carrier_on(dev);
} else {
if (netif_carrier_ok(dev))
netif_carrier_off(dev);
}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
#ifdef CONFIG_SYSFS
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
struct netdev_rx_queue *rx;
size_t sz = count * sizeof(*rx);
BUG_ON(count < 1);
rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
if (!rx) {
rx = vzalloc(sz);
if (!rx)
return -ENOMEM;
}
dev->_rx = rx;
for (i = 0; i < count; i++)
rx[i].dev = dev;
return 0;
}
#endif
static void netdev_init_one_queue(struct net_device *dev,
struct netdev_queue *queue, void *_unused)
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
#ifdef CONFIG_BQL
dql_init(&queue->dql, HZ);
#endif
}
static void netif_free_tx_queues(struct net_device *dev)
{
kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
struct netdev_queue *tx;
size_t sz = count * sizeof(*tx);
if (count < 1 || count > 0xffff)
return -EINVAL;
tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
if (!tx) {
tx = vzalloc(sz);
if (!tx)
return -ENOMEM;
}
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
spin_lock_init(&dev->tx_global_lock);
return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_stop_queue(txq);
}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
/**
* register_netdevice - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* Callers must hold the rtnl semaphore. You may want
* register_netdev() instead of this.
*
* BUGS:
* The locking appears insufficient to guarantee two parallel registers
* will not get the same name.
*/
int register_netdevice(struct net_device *dev)
{
int ret;
struct net *net = dev_net(dev);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
might_sleep();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
goto out;
}
}
if (((dev->hw_features | dev->features) &
NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
goto err_uninit;
}
ret = -EBUSY;
if (!dev->ifindex)
dev->ifindex = dev_new_index(net);
else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
dev->hw_features |= NETIF_F_SOFT_FEATURES;
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
/* If IPv4 TCP segmentation offload is supported we should also
* allow the device to enable segmenting the frame with the option
* of ignoring a static IP ID value. This doesn't enable the
* feature itself but allows the user to enable it later.
*/
if (dev->hw_features & NETIF_F_TSO)
dev->hw_features |= NETIF_F_TSO_MANGLEID;
if (dev->vlan_features & NETIF_F_TSO)
dev->vlan_features |= NETIF_F_TSO_MANGLEID;
if (dev->mpls_features & NETIF_F_TSO)
dev->mpls_features |= NETIF_F_TSO_MANGLEID;
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
goto err_uninit;
ret = netdev_register_kobject(dev);
if (ret)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
/*
* Default initial state at registry is that the
* device is present.
*/
set_bit(__LINK_STATE_PRESENT, &dev->state);
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
* set dev_addr and also addr_assign_type should be set to
* NET_ADDR_PERM (default value).
*/
if (dev->addr_assign_type == NET_ADDR_PERM)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
dev->reg_state = NETREG_UNREGISTERED;
}
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
/**
* init_dummy_netdev - init a dummy network device for NAPI
* @dev: device to init
*
* This takes a network device structure and initialize the minimum
* amount of fields so it can be used to schedule NAPI polls without
* registering a full blown interface. This is to be used by drivers
* that need to tie several hardware interfaces to a single NAPI
* poll scheduler due to HW limitations.
*/
int init_dummy_netdev(struct net_device *dev)
{
/* Clear everything. Note we don't initialize spinlocks
* are they aren't supposed to be taken by any of the
* NAPI code and this dummy netdev is supposed to be
* only ever used for NAPI polls
*/
memset(dev, 0, sizeof(struct net_device));
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
*/
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
/**
* register_netdev - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* This is a wrapper around register_netdevice that takes the rtnl semaphore
* and expands the device name if you passed a format string to
* alloc_netdev.
*/
int register_netdev(struct net_device *dev)
{
int err;
rtnl_lock();
err = register_netdevice(dev);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);
/**
* netdev_wait_allrefs - wait until all references are gone.
* @dev: target net_device
*
* This is called when unregistering network devices.
*
* Any protocol or device that holds a reference should register
* for netdevice notification, and cleanup and put back the
* reference if they receive an UNREGISTER event.
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
int refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
refcnt = netdev_refcnt_read(dev);
while (refcnt != 0) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
* pending on unregister. If this
* happens, we simply run the queue
* unscheduled, resulting in a noop
* for this device.
*/
linkwatch_run_queue();
}
__rtnl_unlock();
rebroadcast_time = jiffies;
}
msleep(250);
refcnt = netdev_refcnt_read(dev);
if (time_after(jiffies, warning_time + 10 * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
warning_time = jiffies;
}
}
}
/* The sequence is:
*
* rtnl_lock();
* ...
* register_netdevice(x1);
* register_netdevice(x2);
* ...
* unregister_netdevice(y1);
* unregister_netdevice(y2);
* ...
* rtnl_unlock();
* free_netdev(y1);
* free_netdev(y2);
*
* We are invoked by rtnl_unlock().
* This allows us to deal with problems:
* 1) We can delete sysfs objects which invoke hotplug
* without deadlocking with linkwatch via keventd.
* 2) Since we run with the RTNL semaphore not held, we can sleep
* safely in order to wait for the netdev refcnt to drop to zero.
*
* We must not return until all unregister events added during
* the interval the lock was held have been completed.
*/
void netdev_run_todo(void)
{
struct list_head list;
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
while (!list_empty(&list)) {
struct net_device *dev
= list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
rtnl_lock();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
__rtnl_unlock();
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
pr_err("network todo '%s' but state %d\n",
dev->name, dev->reg_state);
dump_stack();
continue;
}
dev->reg_state = NETREG_UNREGISTERED;
netdev_wait_allrefs(dev);
/* paranoia */
BUG_ON(netdev_refcnt_read(dev));
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
if (dev->destructor)
dev->destructor(dev);
/* Report a network device has been unregistered */
rtnl_lock();
dev_net(dev)->dev_unreg_count--;
__rtnl_unlock();
wake_up(&netdev_unregistering_wq);
/* Free network device */
kobject_put(&dev->dev.kobj);
}
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
* all the same fields in the same order as net_device_stats, with only
* the type differing, but rtnl_link_stats64 may have additional fields
* at the end for newer counters.
*/
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
memcpy(stats64, netdev_stats, sizeof(*stats64));
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + sizeof(*netdev_stats), 0,
sizeof(*stats64) - sizeof(*netdev_stats));
#else
size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
const unsigned long *src = (const unsigned long *)netdev_stats;
u64 *dst = (u64 *)stats64;
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
dst[i] = src[i];
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + n * sizeof(u64), 0,
sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
/**
* dev_get_stats - get network device statistics
* @dev: device to get statistics from
* @storage: place to store stats
*
* Get network statistics from device. Return @storage.
* The device driver may provide its own method by setting
* dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
* otherwise the internal statistics structure is used.
*/
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
#ifdef CONFIG_NET_CLS_ACT
if (queue)
return queue;
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
if (!queue)
return NULL;
netdev_init_one_queue(dev, queue, NULL);
RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
queue->qdisc_sleeping = &noop_qdisc;
rcu_assign_pointer(dev->ingress_queue, queue);
#endif
return queue;
}
static const struct ethtool_ops default_ethtool_ops;
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops)
{
if (dev->ethtool_ops == &default_ethtool_ops)
dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
void netdev_freemem(struct net_device *dev)
{
char *addr = (char *)dev - dev->padded;
kvfree(addr);
}
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @name_assign_type: origin of device name
* @setup: callback to initialize device
* @txqs: the number of TX subqueues to allocate
* @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name));
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
#ifdef CONFIG_SYSFS
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
#endif
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
if (!p)
p = vzalloc(alloc_size);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = 1;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
goto free_all;
#ifdef CONFIG_SYSFS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
#endif
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
/**
* free_netdev - free network device
* @dev: device
*
* This function does the last stage of destroying an allocated device
* interface. The reference to the device object is released.
* If this is the last reference then it will be freed.
* Must be called in process context.
*/
void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
might_sleep();
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
kvfree(dev->_rx);
#endif
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
/* Flush device addresses */
dev_addr_flush(dev);
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
netdev_freemem(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
dev->reg_state = NETREG_RELEASED;
/* will free via device release */
put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
* synchronize_net - Synchronize with packet receive processing
*
* Wait for packets currently being received to be done.
* Does not block later packets from starting.
*/
void synchronize_net(void)
{
might_sleep();
if (rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
/**
* unregister_netdevice_queue - remove device from the kernel
* @dev: device
* @head: list
*
* This function shuts down a device interface and removes it
* from the kernel tables.
* If head not NULL, device is queued to be unregistered later.
*
* Callers must hold the rtnl semaphore. You may want
* unregister_netdev() instead of this.
*/
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
ASSERT_RTNL();
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
rollback_registered(dev);
/* Finish processing unregister after unlock */
net_set_todo(dev);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
* we force a list_del() to make sure stack wont be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
struct net_device *dev;
if (!list_empty(head)) {
rollback_registered_many(head);
list_for_each_entry(dev, head, unreg_list)
net_set_todo(dev);
list_del(head);
}
}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
* unregister_netdev - remove device from the kernel
* @dev: device
*
* This function shuts down a device interface and removes it
* from the kernel tables.
*
* This is just a wrapper for unregister_netdevice that takes
* the rtnl semaphore. In general you want to use this and not
* unregister_netdevice.
*/
void unregister_netdev(struct net_device *dev)
{
rtnl_lock();
unregister_netdevice(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
/**
* dev_change_net_namespace - move device to different nethost namespace
* @dev: device
* @net: network namespace
* @pat: If not NULL name pattern to try if the current device name
* is already taken in the destination network namespace.
*
* This function shuts down a device interface and moves it
* to a new network namespace. On success 0 is returned, on
* a failure a netagive errno code is returned.
*
* Callers must hold the rtnl semaphore.
*/
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
int err;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
if (dev->features & NETIF_F_NETNS_LOCAL)
goto out;
/* Ensure the device has been registrered */
if (dev->reg_state != NETREG_REGISTERED)
goto out;
/* Get out if there is nothing todo */
err = 0;
if (net_eq(dev_net(dev), net))
goto out;
/* Pick the destination device name, and ensure
* we can use it in the destination network namespace.
*/
err = -EEXIST;
if (__dev_get_by_name(net, dev->name)) {
/* We get here if we can't use the current device name */
if (!pat)
goto out;
if (dev_get_valid_name(net, dev, pat) < 0)
goto out;
}
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
/* If device is running close it first. */
dev_close(dev);
/* And unlink it from device chain */
err = -ENODEV;
unlist_netdevice(dev);
synchronize_net();
/* Shutdown queueing discipline. */
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
Note that dev->reg_state stays at NETREG_REGISTERED.
This is wanted because this way 8021q and macvlan know
the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
/*
* Flush the unicast and multicast chains
*/
dev_uc_flush(dev);
dev_mc_flush(dev);
/* Send a netdev-removed uevent to the old namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
/* Actually switch the network namespace */
dev_net_set(dev, net);
/* If there is an ifindex conflict assign a new one */
if (__dev_get_by_index(net, dev->ifindex))
dev->ifindex = dev_new_index(net);
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
/* Add the device back in the hashes */
list_netdevice(dev);
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
synchronize_net();
err = 0;
out:
return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
static int dev_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *ocpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu, oldcpu = (unsigned long)ocpu;
struct softnet_data *sd, *oldsd;
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
oldsd = &per_cpu(softnet_data, oldcpu);
/* Find end of our completion_queue. */
list_skb = &sd->completion_queue;
while (*list_skb)
list_skb = &(*list_skb)->next;
/* Append completion queue from offline CPU. */
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
/* Append output queue from offline CPU. */
if (oldsd->output_queue) {
*sd->output_queue_tailp = oldsd->output_queue;
sd->output_queue_tailp = oldsd->output_queue_tailp;
oldsd->output_queue = NULL;
oldsd->output_queue_tailp = &oldsd->output_queue;
}
/* Append NAPI poll list from offline CPU, with one exception :
* process_backlog() must be called by cpu owning percpu backlog.
* We properly handle process_queue & input_pkt_queue later.
*/
while (!list_empty(&oldsd->poll_list)) {
struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
struct napi_struct,
poll_list);
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
napi->state = 0;
else
____napi_schedule(sd, napi);
}
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
return NOTIFY_OK;
}
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
* @one: new feature set
* @mask: mask feature set
*
* Computes a new feature set after adding a device with feature set
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask)
{
if (mask & NETIF_F_HW_CSUM)
mask |= NETIF_F_CSUM_MASK;
mask |= NETIF_F_VLAN_CHALLENGED;
all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
/* If one device supports hw checksumming, set for all. */
if (all & NETIF_F_HW_CSUM)
all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
return all;
}
EXPORT_SYMBOL(netdev_increment_features);
static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
struct hlist_head *hash;
hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
if (hash != NULL)
for (i = 0; i < NETDEV_HASHENTRIES; i++)
INIT_HLIST_HEAD(&hash[i]);
return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
goto err_name;
net->dev_index_head = netdev_create_hash();
if (net->dev_index_head == NULL)
goto err_idx;
return 0;
err_idx:
kfree(net->dev_name_head);
err_name:
return -ENOMEM;
}
/**
* netdev_drivername - network driver for the device
* @dev: network device
*
* Determine network driver for device.
*/
const char *netdev_drivername(const struct net_device *dev)
{
const struct device_driver *driver;
const struct device *parent;
const char *empty = "";
parent = dev->dev.parent;
if (!parent)
return empty;
driver = parent->driver;
if (driver && driver->name)
return driver->name;
return empty;
}
static void __netdev_printk(const char *level, const struct net_device *dev,
struct va_format *vaf)
{
if (dev && dev->dev.parent) {
dev_printk_emit(level[1] - '0',
dev->dev.parent,
"%s %s %s%s: %pV",
dev_driver_string(dev->dev.parent),
dev_name(dev->dev.parent),
netdev_name(dev), netdev_reg_state(dev),
vaf);
} else if (dev) {
printk("%s%s%s: %pV",
level, netdev_name(dev), netdev_reg_state(dev), vaf);
} else {
printk("%s(NULL net_device): %pV", level, vaf);
}
}
void netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...)
{
struct va_format vaf;
va_list args;
va_start(args, format);
vaf.fmt = format;
vaf.va = &args;
__netdev_printk(level, dev, &vaf);
va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level) \
void func(const struct net_device *dev, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
\
va_start(args, fmt); \
\
vaf.fmt = fmt; \
vaf.va = &args; \
\
__netdev_printk(level, dev, &vaf); \
\
va_end(args); \
} \
EXPORT_SYMBOL(func);
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
static void __net_exit netdev_exit(struct net *net)
{
kfree(net->dev_name_head);
kfree(net->dev_index_head);
}
static struct pernet_operations __net_initdata netdev_net_ops = {
.init = netdev_init,
.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
rtnl_lock();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
if (dev->features & NETIF_F_NETNS_LOCAL)
continue;
/* Leave virtual devices for the generic cleanup */
if (dev->rtnl_link_ops)
continue;
/* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
pr_emerg("%s: failed to move %s to init_net: %d\n",
__func__, dev->name, err);
BUG();
}
}
rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace in net_list.
*/
struct net *net;
bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
unregistering = false;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
if (net->dev_unreg_count > 0) {
unregistering = true;
break;
}
}
if (!unregistering)
break;
__rtnl_unlock();
wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices most be removed from a network
* namespace. Do this in the reverse order of registration.
* Do this across as many network namespaces as possible to
* improve batching efficiency.
*/
struct net_device *dev;
struct net *net;
LIST_HEAD(dev_kill_list);
/* To prevent network device cleanup code from dereferencing
* loopback devices or network devices that have been freed
* wait here for all pending unregistrations to complete,
* before unregistring the loopback device and allowing the
* network namespace be freed.
*
* The netdev todo list containing all network devices
* unregistrations that happen in default_device_exit_batch
* will run in the rtnl_unlock() at the end of
* default_device_exit_batch.
*/
rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
else
unregister_netdevice_queue(dev, &dev_kill_list);
}
}
unregister_netdevice_many(&dev_kill_list);
rtnl_unlock();
}
static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
INIT_LIST_HEAD(&offload_base);
if (register_pernet_subsys(&netdev_net_ops))
goto out;
/*
* Initialise the packet receive queues.
*/
for_each_possible_cpu(i) {
struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
INIT_WORK(flush, flush_backlog);
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
sd->csd.func = rps_trigger_softirq;
sd->csd.info = sd;
sd->cpu = i;
#endif
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
}
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
* is present in a network namespace the loopback device must
* be present. Since we now dynamically allocate and free the
* loopback device ensure this invariant is maintained by
* keeping the loopback device as the first device on the
* list of network devices. Ensuring the loopback devices
* is the first device that appears and the last network device
* that disappears.
*/
if (register_pernet_device(&loopback_net_ops))
goto out;
if (register_pernet_device(&default_device_ops))
goto out;
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
hotcpu_notifier(dev_cpu_callback, 0);
dst_subsys_init();
rc = 0;
out:
return rc;
}
subsys_initcall(net_dev_init);