Files
Blagovest Kolenichev e06a1054bd Merge android-4.9.84 (a9d0273) into msm-4.9
* refs/heads/tmp-a9d0273:
  Linux 4.9.84
  crypto: s5p-sss - Fix kernel Oops in AES-ECB mode
  KVM: nVMX: invvpid handling improvements
  KVM: VMX: clean up declaration of VPID/EPT invalidation types
  KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously
  x86/microcode/AMD: Change load_microcode_amd()'s param to bool to fix preemptibility bug
  usb: phy: msm add regulator dependency
  arm64: fix warning about swapper_pg_dir overflow
  idle: i7300: add PCI dependency
  spi: bcm-qspi: shut up warning about cfi header inclusion
  binfmt_elf: compat: avoid unused function warning
  arm64: sunxi: always enable reset controller
  drm/i915: hide unused intel_panel_set_backlight function
  kasan: rework Kconfig settings
  clk: meson: gxbb: fix build error without RESET_CONTROLLER
  ISDN: eicon: reduce stack size of sig_ind function
  tw5864: use dev_warn instead of WARN to shut up warning
  em28xx: only use mt9v011 if camera support is enabled
  go7007: add MEDIA_CAMERA_SUPPORT dependency
  tc358743: fix register i2c_rd/wr functions
  shmem: fix compilation warnings on unused functions
  KVM: add X86_LOCAL_APIC dependency
  Input: tca8418_keypad - hide gcc-4.9 -Wmaybe-uninitialized warning
  drm/nouveau: hide gcc-4.9 -Wmaybe-uninitialized
  rbd: silence bogus -Wmaybe-uninitialized warning
  drm: exynos: mark pm functions as __maybe_unused
  security/keys: BIG_KEY requires CONFIG_CRYPTO
  cw1200: fix bogus maybe-uninitialized warning
  reiserfs: avoid a -Wmaybe-uninitialized warning
  ALSA: hda/ca0132 - fix possible NULL pointer use
  arm64: Kconfig: select COMPAT_BINFMT_ELF only when BINFMT_ELF is set
  scsi: advansys: fix uninitialized data access
  x86/vm86: Fix unused variable warning if THP is disabled
  x86/platform: Add PCI dependency for PUNIT_ATOM_DEBUG
  dmaengine: zx: fix build warning
  x86: add MULTIUSER dependency for KVM
  thermal: fix INTEL_SOC_DTS_IOSF_CORE dependencies
  x86/build: Silence the build with "make -s"
  tools build: Add tools tree support for 'make -s'
  x86/fpu/math-emu: Fix possible uninitialized variable use
  arm64: define BUG() instruction without CONFIG_BUG
  gpio: xgene: mark PM functions as __maybe_unused
  x86/ras/inject: Make it depend on X86_LOCAL_APIC=y
  scsi: advansys: fix build warning for PCI=n
  video: fbdev: via: remove possibly unused variables
  perf: xgene: Include module.h
  PCI: Change pci_host_common_probe() visibility
  usb: musb: fix compilation warning on unused function
  platform/x86: intel_mid_thermal: Fix suspend handlers unused warning
  gpio: intel-mid: Fix build warning when !CONFIG_PM
  PCI: vmd: Fix suspend handlers defined-but-not-used warning
  perf/x86: Shut up false-positive -Wmaybe-uninitialized warning
  vmxnet3: prevent building with 64K pages
  clk: sunxi-ng: fix build error without CONFIG_RESET_CONTROLLER
  shmem: avoid maybe-uninitialized warning
  drm/i915: fix intel_backlight_device_register declaration
  crypto: talitos - fix Kernel Oops on hashing an empty file
  powerpc/64s: Improve RFI L1-D cache flush fallback
  powerpc/64s: Simple RFI macro conversions
  powerpc/64s: Fix conversion of slb_miss_common to use RFI_TO_USER/KERNEL
  hippi: Fix a Fix a possible sleep-in-atomic bug in rr_close
  xen: XEN_ACPI_PROCESSOR is Dom0-only
  platform/x86: dell-laptop: Fix keyboard max lighting for Dell Latitude E6410
  x86/mm/kmmio: Fix mmiotrace for page unaligned addresses
  mm/early_ioremap: Fix boot hang with earlyprintk=efi,keep
  usb: dwc3: of-simple: fix missing clk_disable_unprepare
  usb: dwc3: gadget: Wait longer for controller to end command processing
  dmaengine: jz4740: disable/unprepare clk if probe fails
  drm/armada: fix leak of crtc structure
  xfrm: Fix stack-out-of-bounds with misconfigured transport mode policies.
  spi: sun4i: disable clocks in the remove function
  ASoC: rockchip: disable clock on error
  clk: fix a panic error caused by accessing NULL pointer
  dmaengine: at_hdmac: fix potential NULL pointer dereference in atc_prep_dma_interleaved
  dmaengine: ioat: Fix error handling path
  gianfar: Disable EEE autoneg by default
  509: fix printing uninitialized stack memory when OID is empty
  net: ethernet: arc: fix error handling in emac_rockchip_probe
  brcmfmac: Avoid build error with make W=1
  btrfs: Fix possible off-by-one in btrfs_search_path_in_tree
  net_sched: red: Avoid illegal values
  net_sched: red: Avoid devision by zero
  gianfar: fix a flooded alignment reports because of padding issue.
  ARM: dts: Fix elm interrupt compiler warning
  s390/dasd: prevent prefix I/O error
  powerpc/perf: Fix oops when grouping different pmu events
  m68k: add missing SOFTIRQENTRY_TEXT linker section
  ipvlan: Add the skb->mark as flow4's member to lookup route
  scripts/kernel-doc: Don't fail with status != 0 if error encountered with -none
  sctp: only update outstanding_bytes for transmitted queue when doing prsctp_prune
  RDMA/cma: Make sure that PSN is not over max allowed
  i40iw: Correct ARP index mask
  pinctrl: sunxi: Fix A64 UART mux value
  pinctrl: sunxi: Fix A80 interrupt pin bank
  media: s5k6aa: describe some function parameters
  perf bench numa: Fixup discontiguous/sparse numa nodes
  perf top: Fix window dimensions change handling
  ARM: dts: am437x-cm-t43: Correct the dmas property of spi0
  ARM: dts: am4372: Correct the interrupts_properties of McASP
  ARM: dts: logicpd-somlv: Fix wl127x pinmux
  ARM: dts: logicpd-som-lv: Fix gpmc addresses for NAND and enet
  ARM: dts: Fix omap4 hang with GPS connected to USB by using wakeupgen
  ARM: AM33xx: PRM: Remove am33xx_pwrdm_read_prev_pwrst function
  ARM: OMAP2+: Fix SRAM virt to phys translation for save_secure_ram_context
  usb: build drivers/usb/common/ when USB_SUPPORT is set
  usbip: keep usbip_device sockfd state in sync with tcp_socket
  staging: iio: ad5933: switch buffer mode to software
  staging: iio: adc: ad7192: fix external frequency setting
  binder: check for binder_thread allocation failure in binder_poll()
  staging: android: ashmem: Fix a race condition in pin ioctls
  dn_getsockoptdecnet: move nf_{get/set}sockopt outside sock lock
  arm64: dts: add #cooling-cells to CPU nodes
  ARM: 8743/1: bL_switcher: add MODULE_LICENSE tag
  video: fbdev/mmp: add MODULE_LICENSE
  ASoC: ux500: add MODULE_LICENSE tag
  crypto: hash - prevent using keyed hashes without setting key
  crypto: hash - annotate algorithms taking optional key
  net: avoid skb_warn_bad_offload on IS_ERR
  rds: tcp: atomically purge entries from rds_tcp_conn_list during netns delete
  netfilter: xt_RATEEST: acquire xt_rateest_mutex for hash insert
  netfilter: xt_cgroup: initialize info->priv in cgroup_mt_check_v1()
  netfilter: on sockopt() acquire sock lock only in the required scope
  netfilter: ipt_CLUSTERIP: fix out-of-bounds accesses in clusterip_tg_check()
  netfilter: x_tables: avoid out-of-bounds reads in xt_request_find_{match|target}
  netfilter: x_tables: fix int overflow in xt_alloc_table_info()
  kcov: detect double association with a single task
  KVM: x86: fix escape of guest dr6 to the host
  blk_rq_map_user_iov: fix error override
  staging: android: ion: Switch from WARN to pr_warn
  staging: android: ion: Add __GFP_NOWARN for system contig heap
  crypto: x86/twofish-3way - Fix %rbp usage
  selinux: skip bounded transition processing if the policy isn't loaded
  selinux: ensure the context is NUL terminated in security_context_to_sid_core()
  Provide a function to create a NUL-terminated string from unterminated data
  ptr_ring: fail early if queue occupies more than KMALLOC_MAX_SIZE
  drm: Require __GFP_NOFAIL for the legacy drm_modeset_lock_all
  blktrace: fix unlocked registration of tracepoints
  sctp: set frag_point in sctp_setsockopt_maxseg correctly
  xfrm: check id proto in validate_tmpl()
  xfrm: Fix stack-out-of-bounds read on socket policy lookup.
  mm,vmscan: Make unregister_shrinker() no-op if register_shrinker() failed.
  xfrm: skip policies marked as dead while rehashing
  cfg80211: check dev_set_name() return value
  kcm: Only allow TCP sockets to be attached to a KCM mux
  kcm: Check if sk_user_data already set in kcm_attach
  vhost: use mutex_lock_nested() in vhost_dev_lock_vqs()
  ANDROID: sdcardfs: Hold i_mutex for i_size_write
  UPSTREAM: ANDROID: binder: synchronize_rcu() when using POLLFREE.
  BACKPORT, FROMGIT: crypto: speck - add test vectors for Speck64-XTS
  BACKPORT, FROMGIT: crypto: speck - add test vectors for Speck128-XTS
  BACKPORT, FROMGIT: crypto: arm/speck - add NEON-accelerated implementation of Speck-XTS
  FROMGIT: crypto: speck - export common helpers
  BACKPORT, FROMGIT: crypto: speck - add support for the Speck block cipher
  f2fs: updates on v4.16-rc1
  Linux 4.9.83
  media: r820t: fix r820t_write_reg for KASAN
  ARM: dts: Delete bogus reference to the charlcd
  arm: dts: mt2701: Add reset-cells
  ARM: dts: s5pv210: add interrupt-parent for ohci
  arm64: dts: msm8916: Add missing #phy-cells
  ARM: pxa/tosa-bt: add MODULE_LICENSE tag
  ARM: dts: exynos: fix RTC interrupt for exynos5410
  vfs: don't do RCU lookup of empty pathnames
  x86: fix build warnign with 32-bit PAE
  x86/cpu: Change type of x86_cache_size variable to unsigned int
  x86/spectre: Fix an error message
  x86/cpu: Rename cpu_data.x86_mask to cpu_data.x86_stepping
  selftests/x86/mpx: Fix incorrect bounds with old _sigfault
  x86/speculation: Add <asm/msr-index.h> dependency
  nospec: Move array_index_nospec() parameter checking into separate macro
  x86/speculation: Fix up array_index_nospec_mask() asm constraint
  selftests/x86: Do not rely on "int $0x80" in single_step_syscall.c
  selftests/x86: Do not rely on "int $0x80" in test_mremap_vdso.c
  selftests/x86/pkeys: Remove unused functions
  x86/speculation: Clean up various Spectre related details
  X86/nVMX: Properly set spec_ctrl and pred_cmd before merging MSRs
  KVM/x86: Reduce retpoline performance impact in slot_handle_level_range(), by always inlining iterator helper methods
  x86/speculation: Correct Speculation Control microcode blacklist again
  x86/speculation: Update Speculation Control microcode blacklist
  compiler-gcc.h: Introduce __optimize function attribute
  x86/entry/64/compat: Clear registers for compat syscalls, to reduce speculation attack surface
  arm: spear13xx: Fix spics gpio controller's warning
  arm: spear13xx: Fix dmas cells
  arm: spear600: Add missing interrupt-parent of rtc
  ARM: dts: nomadik: add interrupt-parent for clcd
  ARM: dts: STi: Add gpio polarity for "hdmi,hpd-gpio" property
  ARM: lpc3250: fix uda1380 gpio numbers
  arm64: dts: msm8916: Correct ipc references for smsm
  s390: fix handling of -1 in set{,fs}[gu]id16 syscalls
  ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
  PM / devfreq: Propagate error from devfreq_add_device()
  cpufreq: powernv: Dont assume distinct pstate values for nominal and pmin
  RDMA/rxe: Fix a race condition related to the QP error state
  kselftest: fix OOM in memory compaction test
  IB/mlx4: Fix incorrectly releasing steerable UD QPs when have only ETH ports
  IB/qib: Fix comparison error with qperf compare/swap test
  powerpc: fix build errors in stable tree
  dm: correctly handle chained bios in dec_pending()
  usb: Move USB_UHCI_BIG_ENDIAN_* out of USB_SUPPORT
  mvpp2: fix multicast address filter
  ALSA: seq: Fix racy pool initializations
  ALSA: usb-audio: add implicit fb quirk for Behringer UFX1204
  ALSA: hda/realtek: PCI quirk for Fujitsu U7x7
  ALSA: hda/realtek - Enable Thinkpad Dock device for ALC298 platform
  ALSA: usb-audio: Fix UAC2 get_ctl request with a RANGE attribute
  ALSA: hda - Fix headset mic detection problem for two Dell machines
  mtd: nand: vf610: set correct ooblayout
  9p/trans_virtio: discard zero-length reply
  Btrfs: fix unexpected -EEXIST when creating new inode
  Btrfs: fix btrfs_evict_inode to handle abnormal inodes correctly
  Btrfs: fix extent state leak from tree log
  Btrfs: fix crash due to not cleaning up tree log block's dirty bits
  Btrfs: fix deadlock in run_delalloc_nocow
  target/iscsi: avoid NULL dereference in CHAP auth error path
  rtlwifi: rtl8821ae: Fix connection lost problem correctly
  console/dummy: leave .con_font_get set to NULL
  video: fbdev: atmel_lcdfb: fix display-timings lookup
  PCI: keystone: Fix interrupt-controller-node lookup
  MIPS: Fix typo BIG_ENDIAN to CPU_BIG_ENDIAN
  mm: Fix memory size alignment in devm_memremap_pages_release()
  mm: hide a #warning for COMPILE_TEST
  ext4: correct documentation for grpid mount option
  ext4: save error to disk in __ext4_grp_locked_error()
  ext4: fix a race in the ext4 shutdown path
  jbd2: fix sphinx kernel-doc build warnings
  mbcache: initialize entry->e_referenced in mb_cache_entry_create()
  rtc-opal: Fix handling of firmware error codes, prevent busy loops
  drm/radeon: adjust tested variable
  drm/radeon: Add dpm quirk for Jet PRO (v2)
  scsi: smartpqi: allow static build ("built-in")
  BACKPORT: tee: shm: Potential NULL dereference calling tee_shm_register()
  BACKPORT: tee: shm: don't put_page on null shm->pages
  BACKPORT: tee: shm: make function __tee_shm_alloc static
  BACKPORT: tee: optee: check type of registered shared memory
  BACKPORT: tee: add start argument to shm_register callback
  BACKPORT: tee: optee: fix header dependencies
  BACKPORT: tee: shm: inline tee_shm_get_id()
  BACKPORT: tee: use reference counting for tee_context
  BACKPORT: tee: optee: enable dynamic SHM support
  BACKPORT: tee: optee: add optee-specific shared pool implementation
  BACKPORT: tee: optee: store OP-TEE capabilities in private data
  BACKPORT: tee: optee: add registered buffers handling into RPC calls
  BACKPORT: tee: optee: add registered shared parameters handling
  BACKPORT: tee: optee: add shared buffer registration functions
  BACKPORT: tee: optee: add page list manipulation functions
  BACKPORT: tee: optee: Update protocol definitions
  BACKPORT: tee: shm: add page accessor functions
  BACKPORT: tee: shm: add accessors for buffer size and page offset
  BACKPORT: tee: add register user memory
  BACKPORT: tee: flexible shared memory pool creation
  BACKPORT: optee: support asynchronous supplicant requests
  BACKPORT: tee: add TEE_IOCTL_PARAM_ATTR_META
  BACKPORT: tee: add tee_param_is_memref() for driver use
  UPSTREAM: tcp: fix access to sk->sk_state in tcp_poll()
  BACKPORT: tcp: fix potential double free issue for fastopen_req
  BACKPORT: xfrm: Fix return value check of copy_sec_ctx.
  time: Fix ktime_get_raw() incorrect base accumulation
  FROMLIST: coresight: ETM: Add support for ARM Cortex-A73
  FROMLIST: coresight: tmc: implementing TMC-ETR AUX space API
  UPSTREAM: coresight: etm_perf: Fix using uninitialised work
  UPSTREAM: coresight: fix kernel panic caused by invalid CPU
  UPSTREAM: coresight: Fix disabling of CoreSight TPIU
  UPSTREAM: coresight: perf: Add a missing call to etm_free_aux
  UPSTREAM: coresight: tmc: Remove duplicate memset
  UPSTREAM: coresight: tmc: Get rid of mode parameter for helper routines
  UPSTREAM: coresight: tmc: Cleanup operation mode handling
  UPSTREAM: coresight: reset "enable_sink" flag when need be
  sched/fair: prevent possible infinite loop in sched_group_energy
  ANDROID: qtaguid: Fix the UAF probelm with tag_ref_tree
  UPSTREAM: ANDROID: binder: remove waitqueue when thread exits.
  ANDROID: sdcardfs: Protect set_top
  ANDROID: fsnotify: Notify lower fs of open
  Revert "ANDROID: sdcardfs: notify lower file of opens"
  ANDROID: sdcardfs: Use lower getattr times/size
  ANDROID: sched: EAS: check energy_aware() before calling select_energy_cpu_brute() in up-migrate path
  UPSTREAM: eventpoll.h: add missing epoll event masks
  BACKPORT: thermal/drivers/hisi: Add support for hi3660 SoC
  BACKPORT: thermal/drivers/hisi: Prepare to add support for other hisi platforms
  BACKPORT: thermal/drivers/hisi: Add platform prefix to function name
  BACKPORT: thermal/drivers/hisi: Put platform code together
  BACKPORT: thermal/drivers/hisi: Use round up step value
  BACKPORT: thermal/drivers/hisi: Move the clk setup in the corresponding functions
  BACKPORT: thermal/drivers/hisi: Remove mutex_lock in the code
  BACKPORT: thermal/drivers/hisi: Remove thermal data back pointer
  BACKPORT: thermal/drivers/hisi: Convert long to int
  BACKPORT: thermal/drivers/hisi: Rename and remove unused field
  BACKPORT: thermal/drivers/hisi: Remove costly sensor inspection
  BACKPORT: thermal/drivers/hisi: Fix configuration register setting
  BACKPORT: thermal/drivers/hisi: Encapsulate register writes into helpers
  BACKPORT: thermal/drivers/hisi: Remove pointless lock
  BACKPORT: thermal/drivers/hisi: Remove the multiple sensors support
  BACKPORT: thermal: hisilicon: constify thermal_zone_of_device_ops structures
  ANDROID: xattr: Pass EOPNOTSUPP to permission2
  ANDROID: sdcardfs: Move default_normal to superblock
  UPSTREAM: tcp: fix a request socket leak
  UPSTREAM: tcp: fix possible deadlock in TCP stack vs BPF filter
  UPSTREAM: tcp: Add a tcp_filter hook before handle ack packet
  FROMLIST: arm64: kpti: Fix the interaction between ASID switching and software PAN
  FROMLIST: arm64: Move post_ttbr_update_workaround to C code
  fscrypt: updates on 4.15-rc4
  ANDROID: uid_sys_stats: fix the comment
  BACKPORT: optee: fix invalid of_node_put() in optee_driver_init()
  BACKPORT: tee: optee: sync with new naming of interrupts
  BACKPORT: tee: indicate privileged dev in gen_caps
  BACKPORT: tee: optee: interruptible RPC sleep
  BACKPORT: tee: optee: add const to tee_driver_ops and tee_desc structures
  BACKPORT: tee: tee_shm: Constify dma_buf_ops structures.
  BACKPORT: tee: add forward declaration for struct device
  BACKPORT: tee: optee: fix uninitialized symbol 'parg'
  BACKPORT: tee.txt: standardize document format
  BACKPORT: tee: add ARM_SMCCC dependency
  clocksource: arch_timer: make virtual counter access configurable
  arm64: issue isb when trapping CNTVCT_EL0 access
  BACKPORT: arm64: Add CNTFRQ_EL0 trap handler
  BACKPORT: arm64: Add CNTVCT_EL0 trap handler
  ANDROID: sdcardfs: Fix missing break on default_normal
  ANDROID: arm64: kaslr: fixup Falkor workaround for 4.9
  ANDROID: usb: f_fs: Prevent gadget unbind if it is already unbound
  arm64: Kconfig: Reword UNMAP_KERNEL_AT_EL0 kconfig entry
  arm64: use RET instruction for exiting the trampoline
  UPSTREAM: arm64: kaslr: Put kernel vectors address in separate data page
  UPSTREAM: arm64: mm: Introduce TTBR_ASID_MASK for getting at the ASID in the TTBR
  UPSTREAM: arm64: Kconfig: Add CONFIG_UNMAP_KERNEL_AT_EL0
  UPSTREAM: arm64: entry: Add fake CPU feature for unmapping the kernel at EL0
  UPSTREAM: arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks
  UPSTREAM: arm64: erratum: Work around Falkor erratum #E1003 in trampoline code
  UPSTREAM: arm64: entry: Hook up entry trampoline to exception vectors
  UPSTREAM: arm64: entry: Explicitly pass exception level to kernel_ventry macro
  UPSTREAM: arm64: mm: Map entry trampoline into trampoline and kernel page tables
  UPSTREAM: arm64: entry: Add exception trampoline page for exceptions from EL0
  UPSTREAM: arm64: mm: Invalidate both kernel and user ASIDs when performing TLBI
  UPSTREAM: arm64: mm: Add arm64_kernel_unmapped_at_el0 helper
  UPSTREAM: arm64: mm: Allocate ASIDs in pairs
  UPSTREAM: arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN
  UPSTREAM: arm64: mm: Rename post_ttbr0_update_workaround
  UPSTREAM: arm64: mm: Move ASID from TTBR0 to TTBR1
  UPSTREAM: arm64: mm: Temporarily disable ARM64_SW_TTBR0_PAN
  UPSTREAM: arm64: mm: Use non-global mappings for kernel space
  UPSTREAM: arm64: factor out entry stack manipulation
  ANDROID: sdcardfs: Add default_normal option
  ANDROID: sdcardfs: notify lower file of opens
  blkdev: Refactoring block io latency histogram codes
  UPSTREAM: netfilter: conntrack: use power efficient workqueue
  ANDROID: binder: Remove obsolete proc waitqueue.
  UPSTREAM: arm64: setup: introduce kaslr_offset()
  UPSTREAM: kcov: fix comparison callback signature
  UPSTREAM: kcov: support comparison operands collection
  UPSTREAM: kcov: remove pointless current != NULL check
  UPSTREAM: kcov: support compat processes
  UPSTREAM: kcov: simplify interrupt check
  UPSTREAM: kcov: make kcov work properly with KASLR enabled
  UPSTREAM: kcov: add more missing includes
  BACKPORT: irq: Make the irqentry text section unconditional
  UPSTREAM: kasan: make get_wild_bug_type() static
  UPSTREAM: kasan: separate report parts by empty lines
  UPSTREAM: kasan: improve double-free report format
  UPSTREAM: kasan: print page description after stacks
  UPSTREAM: kasan: improve slab object description
  UPSTREAM: kasan: change report header
  UPSTREAM: kasan: simplify address description logic
  UPSTREAM: kasan: change allocation and freeing stack traces headers
  UPSTREAM: kasan: unify report headers
  UPSTREAM: kasan: introduce helper functions for determining bug type
  BACKPORT: kasan: report only the first error by default
  UPSTREAM: kasan: fix races in quarantine_remove_cache()
  UPSTREAM: kasan: resched in quarantine_remove_cache()
  UPSTREAM: kasan, sched/headers: Uninline kasan_enable/disable_current()
  UPSTREAM: kasan: drain quarantine of memcg slab objects
  UPSTREAM: kasan: eliminate long stalls during quarantine reduction
  UPSTREAM: kasan: support panic_on_warn
  ANDROID: dma-buf/sw_sync: Rename active_list to link
  ANDROID: initramfs: call free_initrd() when skipping init
  BACKPORT: Documentation: tee subsystem and op-tee driver
  BACKPORT: tee: add OP-TEE driver
  BACKPORT: tee: generic TEE subsystem
  BACKPORT: dt/bindings: add bindings for optee
  BACKPORT: schedutil: Reset cached freq if it is not in sync with next_freq
  sched: EAS/WALT: Don't take into account of running task's util
  sched: EAS/WALT: take into account of waking task's load
  sched: EAS: upmigrate misfit current task
  sched: avoid pushing tasks to an offline CPU
  sched: Extend active balance to accept 'push_task' argument
  sched: walt: Correct WALT window size initialization
  sched: WALT: account cumulative window demand
  sched: EAS/WALT: finish accounting prior to task_tick
  sched/fair: prevent meaningless active migration
  sched: walt: Leverage existing helper APIs to apply invariance
  UPSTREAM: net: xfrm: allow clearing socket xfrm policies.
  UPSTREAM: time: Clean up CLOCK_MONOTONIC_RAW time handling
  UPSTREAM: arm64: vdso: fix clock_getres for 4GiB-aligned res
  f2fs: updates on 4.15-rc1
  UPSTREAM: android: binder: fix type mismatch warning
  BACKPORT: arm64: Use __pa_symbol for empty_zero_page
  UPSTREAM: arm64: Use __pa_symbol for kernel symbols
  UPSTREAM: mm: Introduce lm_alias
  FROMLIST: binder: fix proc->files use-after-free
  BACKPORT: xfrm: Clear sk_dst_cache when applying per-socket policy.
  sched: WALT: fix potential overflow
  sched: Update task->on_rq when tasks are moving between runqueues
  sched: WALT: fix window mis-alignment
  sched: EAS: kill incorrect nohz idle cpu kick
  sched: EAS: fix incorrect energy delta calculation due to rounding error
  sched: EAS/WALT: use cr_avg instead of prev_runnable_sum
  sched: WALT: fix broken cumulative runnable average accounting
  sched: deadline: WALT: account cumulative runnable avg
  ANDROID: binder: clarify deferred thread work.
  BACKPORT: net/tcp-fastopen: Add new API support
  UPSTREAM: net: Remove __sk_dst_reset() in tcp_v6_connect()
  UPSTREAM: net/tcp-fastopen: refactor cookie check logic
  sched: compute task utilisation with WALT consistently
  FROMLIST: arm64: Avoid aligning normal memory pointers in __memcpy_{to,from}io
  UPSTREAM: security: bpf: replace include of linux/bpf.h with forward declarations
  UPSTREAM: selinux: bpf: Add addtional check for bpf object file receive
  UPSTREAM: selinux: bpf: Add selinux check for eBPF syscall operations
  BACKPORT: security: bpf: Add LSM hooks for bpf object related syscall
  BACKPORT: bpf: Add file mode configuration into bpf maps
  cpufreq: Drop schedfreq governor
  ANDROID: Revert "arm64: move ELF_ET_DYN_BASE to 4GB / 4MB"
  ANDROID: Revert "arm: move ELF_ET_DYN_BASE to 4MB"
  sched: EAS: Fix the condition to distinguish energy before/after
  sched: EAS: update trg_cpu to backup_cpu if no energy saving for target_cpu
  sched/fair: consider task utilization in group_max_util()
  sched/fair: consider task utilization in group_norm_util()
  sched/fair: enforce EAS mode
  sched/fair: ignore backup CPU when not valid
  sched/fair: trace energy_diff for non boosted tasks
  UPSTREAM: sched/fair: Sync task util before slow-path wakeup
  UPSTREAM: sched/core: Add missing update_rq_clock() call in set_user_nice()
  UPSTREAM: sched/core: Add missing update_rq_clock() call for task_hot()
  UPSTREAM: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq()
  UPSTREAM: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg()
  UPSTREAM: sched/fair: Fix task group initialization
  cpufreq/sched: Consider max cpu capacity when choosing frequencies
  cpufreq/sched: Use cpu max freq rather than policy max
  sched/fair: remove erroneous RCU_LOCKDEP_WARN from start_cpu()
  FROMLIST: ALSA: usx2y: Suppress kernel warning at page allocation failures
  FROMLIST: kbuild: clang: fix build failures with sparse check
  Revert "Revert "BACKPORT: efi/libstub/arm64: Set -fpie when building the EFI stub""
  BACKPORT: efi/libstub: Unify command line param parsing
  ANDROID: sched/walt: Fix divide by zero error in cpufreq notifier
  ANDROID: binder: show high watermark of alloc->pages.
  ANDROID: binder: Add thread->process_todo flag.
  ANDROID: sched/fair: Select correct capacity state for energy_diff
  ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits
  BACKPORT: cpufreq: schedutil: Use policy-dependent transition delays
  Revert "BACKPORT: efi/libstub/arm64: Set -fpie when building the EFI stub"
  FROMLIST: android: binder: Fix null ptr dereference in debug msg
  FROMLIST: android: binder: Change binder_shrinker to static
  UPSTREAM: arm64: compat: Remove leftover variable declaration
  ANDROID: HACK: arm64: use -mno-implicit-float instead of -mgeneral-regs-only
  ANDROID: Kbuild, LLVMLinux: allow overriding clang target triple
  CHROMIUM: arm64: Disable asm-operand-width warning for clang
  CHROMIUM: kbuild: clang: Disable the 'duplicate-decl-specifier' warning
  BACKPORT: x86/asm: Fix inline asm call constraints for Clang
  BACKPORT: efi/libstub/arm64: Set -fpie when building the EFI stub
  UPSTREAM: efi/libstub/arm64: Force 'hidden' visibility for section markers
  UPSTREAM: efi/libstub/arm64: Use hidden attribute for struct screen_info reference
  UPSTREAM: x86/build: Use cc-option to validate stack alignment parameter
  UPSTREAM: x86/build: Fix stack alignment for CLang
  UPSTREAM: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled
  UPSTREAM: x86/boot: #undef memcpy() et al in string.c
  UPSTREAM: llist: clang: introduce member_address_is_nonnull()
  UPSTREAM: crypto: arm64/sha - avoid non-standard inline asm tricks
  UPSTREAM: kbuild: clang: Disable 'address-of-packed-member' warning
  UPSTREAM: x86/build: Specify stack alignment for clang
  UPSTREAM: x86/build: Use __cc-option for boot code compiler options
  UPSTREAM: kbuild: Add __cc-option macro
  UPSTREAM: x86/mm/kaslr: Use the _ASM_MUL macro for multiplication to work around Clang incompatibility
  UPSTREAM: crypto, x86: aesni - fix token pasting for clang
  UPSTREAM: x86/kbuild: Use cc-option to enable -falign-{jumps/loops}
  UPSTREAM: compiler, clang: properly override 'inline' for clang
  UPSTREAM: compiler, clang: suppress warning for unused static inline functions
  UPSTREAM: modules: mark __inittest/__exittest as __maybe_unused
  UPSTREAM: kbuild: Add support to generate LLVM assembly files
  UPSTREAM: kbuild: use -Oz instead of -Os when using clang
  UPSTREAM: kbuild, LLVMLinux: Add -Werror to cc-option to support clang
  UPSTREAM: kbuild: drop -Wno-unknown-warning-option from clang options
  UPSTREAM: kbuild: fix asm-offset generation to work with clang
  UPSTREAM: kbuild: consolidate redundant sed script ASM offset generation
  UPSTREAM: kbuild: Consolidate header generation from ASM offset information
  UPSTREAM: kbuild: clang: add -no-integrated-as to KBUILD_[AC]FLAGS
  UPSTREAM: kbuild: Add better clang cross build support
  FROMLIST: f2fs: expose some sectors to user in inline data or dentry case
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when the local group is idlest
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when no groups are allowed
  UPSTREAM: sched/fair: Fix find_idlest_group() when local group is not allowed
  UPSTREAM: sched/fair: Remove unnecessary comparison with -1
  UPSTREAM: sched/fair: Move select_task_rq_fair() slow-path into its own function
  UPSTREAM: sched/fair: Force balancing on NOHZ balance if local group has capacity
  UPSTREAM: f2fs: fix potential panic during fstrim
  f2fs: catch up to v4.14-rc1
  UPSTREAM: sched: use load_avg for selecting idlest group
  UPSTREAM: sched: fix find_idlest_group for fork
  ANDROID: binder: fix node sched policy calculation
  ANDROID: binder: init desired_prio.sched_policy before use it
  BACKPORT: net: xfrm: support setting an output mark.
  FROMLIST: tracing: Add support for preempt and irq enable/disable events
  FROMLIST: tracing: Prepare to add preempt and irq trace events

Conflicts:
	arch/arm64/Kconfig
	arch/arm64/include/asm/assembler.h
	arch/arm64/include/asm/cpucaps.h
	arch/arm64/include/asm/efi.h
	arch/arm64/include/asm/memory.h
	arch/arm64/include/asm/mmu.h
	arch/arm64/include/asm/mmu_context.h
	arch/arm64/kernel/cpufeature.c
	arch/arm64/kernel/io.c
	arch/arm64/kernel/setup.c
	arch/arm64/kernel/vdso.c
	arch/arm64/mm/context.c
	arch/arm64/mm/mmu.c
	drivers/Kconfig
	drivers/Makefile
	drivers/cpufreq/Kconfig
	drivers/hwtracing/coresight/coresight-etm4x.c
	drivers/hwtracing/coresight/coresight-priv.h
	drivers/hwtracing/coresight/coresight-tmc-etr.c
	drivers/hwtracing/coresight/coresight.c
	drivers/scsi/ufs/ufshcd.h
	drivers/staging/android/ion/ion-ioctl.c
	drivers/staging/android/ion/ion_system_heap.c
	drivers/usb/dwc3/gadget.c
	include/linux/sched.h
	include/trace/events/sched.h
	kernel/kcov.c
	kernel/sched/core.c
	kernel/sched/cpufreq_sched.c
	kernel/sched/cpufreq_schedutil.c
	kernel/sched/fair.c
	kernel/sched/sched.h
	kernel/sched/walt.c
	kernel/sched/walt.h
	mm/kasan/report.c
	security/security.c
	security/selinux/hooks.c

Change-Id: I0ec8cbca6cb6384e22fbbe8def8a9d228229dc48
Signed-off-by: Blagovest Kolenichev <bkolenichev@codeaurora.org>
2018-03-19 04:18:31 -07:00

2829 lines
64 KiB
C

/*
* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "core"
#ifdef CONFIG_PRINTK
/*
 * Ratelimit state to be used in DMXXX_LIMIT().
 *
 * Defined only when CONFIG_PRINTK is enabled, using the default ratelimit
 * interval and burst, and exported so code outside this file can use it.
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 *
 * NOTE(review): DM_COOKIE_ENV_VAR_NAME is presumably the environment
 * variable name carrying the cookie in the uevent, and DM_COOKIE_LENGTH
 * presumably the size of the buffer the cookie is formatted into --
 * confirm both at the use site.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
/* Initialised from DM_NAME. NOTE(review): presumably the driver name used at registration -- confirm. */
static const char *_name = DM_NAME;
/*
 * Two separate major-number variables, both starting at 0.
 * NOTE(review): presumably `major` is the requested value and `_major` the
 * one actually in use -- confirm against the registration code.
 */
static unsigned int major = 0;
static unsigned int _major = 0;
/*
 * IDR and spinlock for minor numbers.
 * NOTE(review): judging by the names, _minor_lock protects _minor_idr --
 * confirm at the use sites.
 */
static DEFINE_IDR(_minor_idr);
static DEFINE_SPINLOCK(_minor_lock);
/*
 * Deferred removal: a statically declared work item whose handler is
 * do_deferred_remove(), plus a pointer to the workqueue for it (not
 * initialised here).
 */
static void do_deferred_remove(struct work_struct *w);
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
static struct workqueue_struct *deferred_remove_workqueue;
/*
* One of these is allocated per bio.
* It tracks the original bio while its clones are in flight; see
* __split_and_process_bio() for setup and dec_pending() for completion.
*/
struct dm_io {
/* device the original bio was submitted to */
struct mapped_device *md;
/* completion status, updated under endio_lock in dec_pending() */
int error;
/* one reference for the submitter plus one per clone handed to __map_bio() */
atomic_t io_count;
/* the original bio */
struct bio *bio;
/* jiffies value recorded by start_io_acct() */
unsigned long start_time;
/* serialises updates of ->error */
spinlock_t endio_lock;
/* passed to dm_stats_account_io() at start and end of the I/O */
struct dm_stats_aux stats_aux;
};
union map_info *dm_get_rq_mapinfo(struct request *rq)
{
if (rq && rq->end_io_data)
return &((struct dm_rq_target_io *)rq->end_io_data)->info;
return NULL;
}
/*
* Placeholder stored in _minor_idr by specific_minor()/next_free_minor()
* to reserve a minor before a mapped_device pointer is available.
*/
#define MINOR_ALLOCED ((void *)-1)
/*
* Bits for the md->flags field.
*/
/* set: dm_make_request() queues incoming bios instead of mapping them */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
/* FREEING or DELETING set: dm_blk_open() refuses new opens with -ENXIO */
#define DMF_FREEING 3
#define DMF_DELETING 4
/* set: dec_pending() re-queues pushed-back bios instead of failing them */
#define DMF_NOFLUSH_SUSPENDING 5
/* set: the last dm_blk_close() queues the deferred removal work */
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
/* Default NUMA node; dm_get_numa_node() clamps dm_numa_node before use. */
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
/*
* For mempools pre-allocation at the table loading time.
* Bundles two mempools and a bio_set so they can be handled as one unit.
*/
struct dm_md_mempools {
mempool_t *io_pool;
mempool_t *rq_pool;
struct bio_set *bs;
};
/*
* An underlying block device opened on behalf of a mapped_device.
* Entries live on md->table_devices and are reference counted by
* dm_get_table_device()/dm_put_table_device().
*/
struct table_device {
/* link in md->table_devices */
struct list_head list;
/* number of dm_get_table_device() references */
atomic_t count;
/* embedded handle returned to callers; container_of() recovers the entry */
struct dm_dev dm_dev;
};
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;
/*
* Bio-based DM's mempools' reserved IOs set by the user.
*/
#define RESERVED_BIO_BASED_IOS 16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
/*
 * Read *module_param once and clamp it to [min, max].
 *
 * When the stored value is out of range, the clamped value is written
 * back with cmpxchg() (the result is ignored, so a concurrent update
 * wins) and the clamped value is returned.
 */
static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int seen = ACCESS_ONCE(*module_param);
	int clamped;

	if (seen < min)
		clamped = min;
	else if (seen > max)
		clamped = max;
	else
		return seen;

	(void)cmpxchg(module_param, seen, clamped);
	return clamped;
}
/*
 * Read *module_param once; substitute @def when it is 0 and @max when it
 * exceeds @max.
 *
 * A non-zero replacement is written back with cmpxchg() (result ignored)
 * and returned. A replacement of 0 is never applied: the value read is
 * returned unchanged in that case.
 */
unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned seen = ACCESS_ONCE(*module_param);
	unsigned replacement;

	if (!seen)
		replacement = def;
	else if (seen > max)
		replacement = max;
	else
		replacement = 0;

	if (!replacement)
		return seen;

	(void)cmpxchg(module_param, seen, replacement);
	return replacement;
}
/*
 * Return the reserved_bio_based_ios setting, with 0 replaced by
 * RESERVED_BIO_BASED_IOS and values above DM_RESERVED_MAX_IOS capped.
 */
unsigned dm_get_reserved_bio_based_ios(void)
{
	unsigned *param = &reserved_bio_based_ios;

	return __dm_get_module_param(param, RESERVED_BIO_BASED_IOS,
				     DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
/*
 * Return dm_numa_node clamped to [DM_NUMA_NODE, num_online_nodes() - 1].
 *
 * NOTE(review): the helper returns int but this function returns unsigned,
 * so DM_NUMA_NODE (NUMA_NO_NODE) passes through an unsigned conversion;
 * the visible caller assigns the result back to an int.
 */
static unsigned dm_get_numa_node(void)
{
	int highest_node = num_online_nodes() - 1;

	return __dm_get_module_param_int(&dm_numa_node, DM_NUMA_NODE,
					 highest_node);
}
/*
* Set up the resources owned by this file: the three slab caches, uevent
* support, the deferred-removal workqueue and the block major number.
*
* Returns 0 on success or a negative errno. On failure everything that
* was set up so far is released in reverse order via the out_* labels.
*/
static int __init local_init(void)
{
int r = -ENOMEM;
/* allocate a slab for the dm_ios */
_io_cache = KMEM_CACHE(dm_io, 0);
if (!_io_cache)
return r;
_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
if (!_rq_tio_cache)
goto out_free_io_cache;
_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
__alignof__(struct request), 0, NULL);
if (!_rq_cache)
goto out_free_rq_tio_cache;
r = dm_uevent_init();
if (r)
goto out_free_rq_cache;
deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
if (!deferred_remove_workqueue) {
/* r was overwritten with 0 by dm_uevent_init(), so set it again */
r = -ENOMEM;
goto out_uevent_exit;
}
_major = major;
r = register_blkdev(_major, _name);
if (r < 0)
goto out_free_workqueue;
/* no major was requested: adopt the one register_blkdev() returned */
if (!_major)
_major = r;
return 0;
out_free_workqueue:
destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
out_free_rq_cache:
kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
kmem_cache_destroy(_io_cache);
return r;
}
/*
* Release everything local_init() set up: flush pending work, destroy the
* deferred-removal workqueue and the slab caches, unregister the block
* major, shut down uevent support and reset _major to 0.
*/
static void local_exit(void)
{
flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
dm_uevent_exit();
_major = 0;
DMINFO("cleaned up");
}
/*
* Initialisation functions, called in array order by dm_init().
* Entry i must correspond to _exits[i]: dm_init() unwinds a failed
* initialisation by index.
*/
static int (*_inits[])(void) __initdata = {
local_init,
dm_target_init,
dm_linear_init,
dm_stripe_init,
dm_io_init,
dm_kcopyd_init,
dm_interface_init,
dm_statistics_init,
};
/*
* Teardown functions, index-matched with _inits[]. dm_init() (on failure)
* and dm_exit() call them from the highest index downwards.
*/
static void (*_exits[])(void) = {
local_exit,
dm_target_exit,
dm_linear_exit,
dm_stripe_exit,
dm_io_exit,
dm_kcopyd_exit,
dm_interface_exit,
dm_statistics_exit,
};
/*
 * Run every entry of _inits[] in order.
 *
 * Returns 0 when all succeed. On the first failure, the _exits[] entries
 * matching the already-completed initialisers are called in reverse order
 * and the failing initialiser's return value is passed back.
 */
static int __init dm_init(void)
{
	const int nr_inits = ARRAY_SIZE(_inits);
	int idx, err = 0;

	for (idx = 0; idx < nr_inits; idx++) {
		err = _inits[idx]();
		if (err)
			break;
	}

	if (!err)
		return 0;

	/* idx is the failed step; undo steps idx-1 .. 0 */
	while (idx--)
		_exits[idx]();

	return err;
}
/*
 * Call every _exits[] entry from last to first, then destroy the minor
 * number IDR.
 */
static void __exit dm_exit(void)
{
	int idx;

	for (idx = ARRAY_SIZE(_exits) - 1; idx >= 0; idx--)
		_exits[idx]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}
/*
* Block device functions
*/
/* Return non-zero when DMF_DELETING is set in md->flags. */
int dm_deleting_md(struct mapped_device *md)
{
return test_bit(DMF_DELETING, &md->flags);
}
/*
 * Open handler for the block device.
 *
 * Under _minor_lock, look up the mapped_device from the disk's
 * private_data. If it exists and is neither being freed nor deleted,
 * take a reference with dm_get(), bump open_count and return 0;
 * otherwise return -ENXIO.
 */
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;
	int ret = -ENXIO;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (md && !test_bit(DMF_FREEING, &md->flags) && !dm_deleting_md(md)) {
		dm_get(md);
		atomic_inc(&md->open_count);
		ret = 0;
	}

	spin_unlock(&_minor_lock);

	return ret;
}
/*
 * Release handler for the block device.
 *
 * Under _minor_lock, drop one open count and the reference taken by
 * dm_blk_open(). When the last opener goes away and DMF_DEFERRED_REMOVE
 * is set, the deferred removal work is queued. Warns and does nothing
 * else if the disk has no mapped_device attached.
 */
static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (!WARN_ON(!md)) {
		if (atomic_dec_and_test(&md->open_count) &&
		    test_bit(DMF_DEFERRED_REMOVE, &md->flags))
			queue_work(deferred_remove_workqueue, &deferred_remove_work);

		dm_put(md);
	}

	spin_unlock(&_minor_lock);
}
/* Return the current value of md->open_count. */
int dm_open_count(struct mapped_device *md)
{
return atomic_read(&md->open_count);
}
/*
 * Guarantees nothing is using the device before it's deleted.
 *
 * Decided under _minor_lock:
 *  - device still open: return -EBUSY, after setting DMF_DEFERRED_REMOVE
 *    when @mark_deferred is true;
 *  - @only_deferred set but DMF_DEFERRED_REMOVE not flagged: -EEXIST;
 *  - otherwise set DMF_DELETING and return 0.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int ret;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
		ret = -EBUSY;
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) {
		ret = -EEXIST;
	} else {
		set_bit(DMF_DELETING, &md->flags);
		ret = 0;
	}

	spin_unlock(&_minor_lock);

	return ret;
}
/*
 * Clear DMF_DEFERRED_REMOVE unless the device is already marked
 * DMF_DELETING, in which case -EBUSY is returned. Runs under _minor_lock.
 */
int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int ret = -EBUSY;

	spin_lock(&_minor_lock);

	if (!test_bit(DMF_DELETING, &md->flags)) {
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
		ret = 0;
	}

	spin_unlock(&_minor_lock);

	return ret;
}
/* Work function for deferred_remove_work: just calls dm_deferred_remove(). */
static void do_deferred_remove(struct work_struct *w)
{
dm_deferred_remove();
}
/* Return the capacity of md's disk as reported by get_capacity(). */
sector_t dm_get_size(struct mapped_device *md)
{
return get_capacity(md->disk);
}
/* Return md's request queue. */
struct request_queue *dm_get_md_queue(struct mapped_device *md)
{
return md->queue;
}
/* Return a pointer to the dm_stats structure embedded in md. */
struct dm_stats *dm_get_stats(struct mapped_device *md)
{
return &md->stats;
}
/*
 * getgeo handler: copy the geometry stored for the mapped_device behind
 * @bdev into @geo via dm_get_geometry().
 */
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	return dm_get_geometry(bdev->bd_disk->private_data, geo);
}
/*
* Find the block device an ioctl on md should be forwarded to.
*
* Only a live, non-empty table with exactly one target that implements
* prepare_ioctl is supported; otherwise -ENOTTY is returned. A suspended
* device yields -EAGAIN. On success the target's non-negative
* prepare_ioctl result is returned and a reference is held on *bdev
* (taken with bdgrab(); the caller drops it).
*
* If the result is -ENOTCONN and no fatal signal is pending, the lookup
* is retried after a 10ms sleep.
*/
static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
struct block_device **bdev,
fmode_t *mode)
{
struct dm_target *tgt;
struct dm_table *map;
int srcu_idx, r;
retry:
/* default result for every "unsupported" exit below */
r = -ENOTTY;
map = dm_get_live_table(md, &srcu_idx);
if (!map || !dm_table_get_size(map))
goto out;
/* We only support devices that have a single target */
if (dm_table_get_num_targets(map) != 1)
goto out;
tgt = dm_table_get_target(map, 0);
if (!tgt->type->prepare_ioctl)
goto out;
if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
r = tgt->type->prepare_ioctl(tgt, bdev, mode);
if (r < 0)
goto out;
/* pin the device before the live table reference is dropped */
bdgrab(*bdev);
dm_put_live_table(md, srcu_idx);
return r;
out:
dm_put_live_table(md, srcu_idx);
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
msleep(10);
goto retry;
}
return r;
}
/*
 * ioctl handler: forward the ioctl to the device selected by
 * dm_grab_bdev_for_ioctl().
 *
 * A negative lookup result is returned as-is. A positive result means the
 * target wants extra validation, so scsi_verify_blk_ioctl() must pass
 * before the ioctl is issued. The reference on the selected device is
 * dropped before returning.
 */
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	/*
	 * Target determined this ioctl is being issued against
	 * a logical partition of the parent bdev; so extra
	 * validation is needed.
	 */
	if (r > 0)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	if (!r)
		r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);

	bdput(bdev);
	return r;
}
/* Allocate a dm_io from md's io mempool using GFP_NOIO. */
static struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
}
/* Return io to md's io mempool. */
static void free_io(struct mapped_device *md, struct dm_io *io)
{
mempool_free(io, md->io_pool);
}
/*
* Drop the reference on the clone bio embedded in tio. The tio is obtained
* from the bio via container_of() in alloc_tio(), so it is not freed
* separately here.
*/
static void free_tio(struct dm_target_io *tio)
{
bio_put(&tio->clone);
}
/* Return the sum of md's pending READ and WRITE counters. */
int md_in_flight(struct mapped_device *md)
{
	int reads = atomic_read(&md->pending[READ]);
	int writes = atomic_read(&md->pending[WRITE]);

	return reads + writes;
}
/*
* Start accounting for io: record the start time, refresh the disk's part0
* statistics, bump the per-direction pending counter and, when dm-stats
* is in use, account the start of the I/O there too.
*/
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
int cpu;
int rw = bio_data_dir(bio);
io->start_time = jiffies;
cpu = part_stat_lock();
part_round_stats(cpu, &dm_disk(md)->part0);
part_stat_unlock();
/* mirror the new pending count into part0's in_flight counter */
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
false, 0, &io->stats_aux);
}
/*
* Finish accounting for io: report completion to the generic block
* accounting and to dm-stats (when in use), drop the per-direction pending
* counter and wake md->wait once nothing is pending in either direction.
*/
static void end_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
int pending;
int rw = bio_data_dir(bio);
generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
/*
* After this is decremented the bio must not be touched if it is
* a flush.
*/
pending = atomic_dec_return(&md->pending[rw]);
atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
/* rw^0x1 selects the opposite direction's counter */
pending += atomic_read(&md->pending[rw^0x1]);
/* nudge anyone waiting on suspend queue */
if (!pending)
wake_up(&md->wait);
}
/*
* Add the bio to the list of deferred io.
* The list is protected by md->deferred_lock (taken with interrupts
* disabled); md->work is then queued on md->wq.
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
unsigned long flags;
spin_lock_irqsave(&md->deferred_lock, flags);
bio_list_add(&md->deferred, bio);
spin_unlock_irqrestore(&md->deferred_lock, flags);
queue_work(md->wq, &md->work);
}
/*
* Everyone (including functions in this file), should use this
* function to access the md->map field, and make sure they call
* dm_put_live_table() when finished.
*
* Takes an SRCU read lock on md->io_barrier and stores the index the
* caller must pass to dm_put_live_table() in *srcu_idx. The returned
* table pointer may be NULL.
*/
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
*srcu_idx = srcu_read_lock(&md->io_barrier);
return srcu_dereference(md->map, &md->io_barrier);
}
/*
* Release the SRCU read lock taken by dm_get_live_table(); srcu_idx is the
* value that call stored.
*/
void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
srcu_read_unlock(&md->io_barrier, srcu_idx);
}
/*
* Wait for existing readers of md->map to finish: an SRCU grace period on
* md->io_barrier (dm_get_live_table() users) followed by an expedited RCU
* grace period (dm_get_live_table_fast() users).
*/
void dm_sync_table(struct mapped_device *md)
{
synchronize_srcu(&md->io_barrier);
synchronize_rcu_expedited();
}
/*
* A fast alternative to dm_get_live_table/dm_put_live_table.
* The caller must not block between these two functions.
*
* Uses a plain RCU read-side critical section instead of SRCU; the
* returned table pointer may be NULL.
*/
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
rcu_read_lock();
return rcu_dereference(md->map);
}
/* End the RCU read-side section begun by dm_get_live_table_fast(); md is unused. */
static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
rcu_read_unlock();
}
/*
 * Open the block device @dev exclusively for @td and link it as a holder
 * of md's disk.
 *
 * @td must not already have a device open (BUG otherwise). Returns 0 with
 * td->dm_dev.bdev set, or a negative errno with the device closed again.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	static char *_claim_ptr = "I belong to device-mapper";
	fmode_t open_mode = td->dm_dev.mode | FMODE_EXCL;
	struct block_device *bdev;
	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, open_mode, _claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, open_mode);
		return r;
	}

	td->dm_dev.bdev = bdev;
	return 0;
}
/*
 * Undo open_table_device(): unlink the holder relationship, release the
 * exclusive open and clear td->dm_dev.bdev. Does nothing when no device
 * is open.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	struct block_device *bdev = td->dm_dev.bdev;

	if (!bdev)
		return;

	bd_unlink_disk_holder(bdev, dm_disk(md));
	blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
	td->dm_dev.bdev = NULL;
}
/*
 * Search list @l for an entry whose opened device number equals @dev and
 * whose mode equals @mode. Returns the entry or NULL.
 */
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode)
{
	struct table_device *td;

	list_for_each_entry(td, l, list) {
		if (td->dm_dev.bdev->bd_dev != dev)
			continue;
		if (td->dm_dev.mode == mode)
			return td;
	}

	return NULL;
}
/*
 * Get a referenced dm_dev for (@dev, @mode) on behalf of @md.
 *
 * An existing entry on md->table_devices is reused; otherwise a new one
 * is allocated, opened and added to the list. The entry's count is
 * incremented and *result points at its embedded dm_dev.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, or the error from
 * open_table_device(). Serialised by md->table_devices_lock.
 */
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result)
{
	struct table_device *td;
	int r;

	mutex_lock(&md->table_devices_lock);

	td = find_table_device(&md->table_devices, dev, mode);
	if (td)
		goto take_ref;

	td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
	if (!td) {
		mutex_unlock(&md->table_devices_lock);
		return -ENOMEM;
	}

	td->dm_dev.mode = mode;
	td->dm_dev.bdev = NULL;

	r = open_table_device(td, dev, md);
	if (r) {
		mutex_unlock(&md->table_devices_lock);
		kfree(td);
		return r;
	}

	format_dev_t(td->dm_dev.name, dev);

	/* starts at 0; the common path below takes the first reference */
	atomic_set(&td->count, 0);
	list_add(&td->list, &md->table_devices);

take_ref:
	atomic_inc(&td->count);
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}
EXPORT_SYMBOL_GPL(dm_get_table_device);
void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
struct table_device *td = container_of(d, struct table_device, dm_dev);
mutex_lock(&md->table_devices_lock);
if (atomic_dec_and_test(&td->count)) {
close_table_device(td, md);
list_del(&td->list);
kfree(td);
}
mutex_unlock(&md->table_devices_lock);
}
EXPORT_SYMBOL(dm_put_table_device);
static void free_table_devices(struct list_head *devices)
{
struct list_head *tmp, *next;
list_for_each_safe(tmp, next, devices) {
struct table_device *td = list_entry(tmp, struct table_device, list);
DMWARN("dm_destroy: %s still exists with %d references",
td->dm_dev.name, atomic_read(&td->count));
kfree(td);
}
}
/*
* Get the geometry associated with a dm device
* by copying md->geometry into *geo. Always returns 0.
*/
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
*geo = md->geometry;
return 0;
}
/*
 * Set the geometry of a device.
 *
 * The geometry is rejected with -EINVAL (and a warning) when its start
 * sector lies beyond cylinders * heads * sectors; otherwise it is copied
 * into md->geometry and 0 is returned.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t capacity = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start <= capacity) {
		md->geometry = *geo;
		return 0;
	}

	DMWARN("Start sector is beyond the geometry limits.");
	return -EINVAL;
}
/*-----------------------------------------------------------------
* CRUD START:
* A more elegant soln is in the works that uses the queue
* merge fn, unfortunately there are a couple of changes to
* the block layer that I want to make for this. So in the
* interests of getting something for people to use I give
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
/* Return non-zero when DMF_NOFLUSH_SUSPENDING is set in md->flags. */
static int __noflush_suspending(struct mapped_device *md)
{
return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necessary.
*
* A non-zero error is recorded in io->error first. When the last
* reference is dropped the original bio is either pushed back onto
* md->deferred (requeue during a noflush suspend), re-queued without
* REQ_PREFLUSH (flush with data), or completed with the recorded error.
*/
static void dec_pending(struct dm_io *io, int error)
{
unsigned long flags;
int io_error;
struct bio *bio;
struct mapped_device *md = io->md;
/* Push-back supersedes any I/O errors */
if (unlikely(error)) {
spin_lock_irqsave(&io->endio_lock, flags);
/* keep an already recorded positive status while noflush suspending */
if (!(io->error > 0 && __noflush_suspending(md)))
io->error = error;
spin_unlock_irqrestore(&io->endio_lock, flags);
}
if (atomic_dec_and_test(&io->io_count)) {
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md))
bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
/* copy what is still needed: io is freed just below */
io_error = io->error;
bio = io->bio;
end_io_acct(io);
free_io(md, io);
/* the bio now sits on md->deferred; nothing to complete here */
if (io_error == DM_ENDIO_REQUEUE)
return;
if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
/*
* Preflush done for flush with data, reissue
* without REQ_PREFLUSH.
*/
bio->bi_opf &= ~REQ_PREFLUSH;
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
trace_block_bio_complete(md->queue, bio, io_error);
if (io_error)
bio->bi_error = io_error;
bio_endio(bio);
}
}
}
void disable_write_same(struct mapped_device *md)
{
struct queue_limits *limits = dm_get_queue_limits(md);
/* device doesn't really support WRITE SAME, disable it */
limits->max_write_same_sectors = 0;
}
/*
* Completion handler installed on every clone bio by __map_bio().
*
* Gives the target's end_io method (if any) a chance to change the
* result or take over the bio, disables WRITE SAME on the mapped device
* when the underlying queue turned out not to support it, then frees the
* clone and drops its reference on the dm_io.
*/
static void clone_endio(struct bio *bio)
{
int error = bio->bi_error;
int r = error;
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
if (endio) {
r = endio(tio->ti, bio, error);
if (r < 0 || r == DM_ENDIO_REQUEUE)
/*
* error and requeue request are handled
* in dec_pending().
*/
error = r;
else if (r == DM_ENDIO_INCOMPLETE)
/* The target will handle the io */
return;
else if (r) {
DMWARN("unimplemented target endio return value: %d", r);
BUG();
}
}
/* WRITE SAME failed with -EREMOTEIO and the lower queue advertises none */
if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
!bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
disable_write_same(md);
/* io and md were read from tio above, before the clone is released */
free_tio(tio);
dec_pending(io, error);
}
/*
 * Number of sectors from @sector to the end of target @ti, i.e. the
 * largest I/O that can start at @sector without crossing the target
 * boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	return ti->len - dm_target_offset(ti, sector);
}
/*
 * Largest I/O, in sectors, that may start at @sector within @ti.
 *
 * Starts from the distance to the target boundary and, when the target
 * sets max_io_len, further limits it to the distance to the next
 * max_io_len-sized boundary inside the target.
 */
static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, to_boundary;

	/*
	 * Does the target need to split even further?
	 */
	if (!ti->max_io_len)
		return len;

	offset = dm_target_offset(ti, sector);

	/* position inside the current chunk: mask for powers of two, divide otherwise */
	if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
		to_boundary = sector_div(offset, ti->max_io_len);
	else
		to_boundary = offset & (ti->max_io_len - 1);

	to_boundary = ti->max_io_len - to_boundary;

	return len > to_boundary ? to_boundary : len;
}
/*
 * Set ti->max_io_len to @len.
 *
 * Values above UINT_MAX are rejected: an error is logged, ti->error is
 * set and -EINVAL is returned. Returns 0 on success.
 */
int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len <= UINT_MAX) {
		ti->max_io_len = (uint32_t) len;
		return 0;
	}

	DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
	      (unsigned long long)len, UINT_MAX);
	ti->error = "Maximum size of target IO is too large";
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
/*
* direct_access handler: pass the request to the target that maps sector.
*
* size is first limited to the number of bytes max_io_len() allows at
* sector. Returns min(ret, size), where ret is the target's direct_access
* result, or -EIO when there is no live table, no valid target for
* sector, or the target type has no direct_access method.
*/
static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
void **kaddr, pfn_t *pfn, long size)
{
struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_table *map;
struct dm_target *ti;
int srcu_idx;
long len, ret = -EIO;
map = dm_get_live_table(md, &srcu_idx);
if (!map)
goto out;
ti = dm_table_find_target(map, sector);
if (!dm_target_is_valid(ti))
goto out;
/* max_io_len() is in sectors; convert to bytes */
len = max_io_len(sector, ti) << SECTOR_SHIFT;
size = min(len, size);
if (ti->type->direct_access)
ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
out:
dm_put_live_table(md, srcu_idx);
return min(ret, size);
}
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH.
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
* sent in a next bio.
*
* A diagram that explains the arithmetics:
* +--------------------+---------------+-------+
* | 1 | 2 | 3 |
* +--------------------+---------------+-------+
*
* <-------------- *tio->len_ptr --------------->
* <------- bi_size ------->
* <-- n_sectors -->
*
* Region 1 was already iterated over with bio_advance or similar function.
* (it may be empty if the target doesn't use bio_advance)
* Region 2 is the remaining bio size that the target wants to process.
* (it may be empty if region 1 is non-empty, although there is no reason
* to make it empty)
* The target requires that region 3 is to be sent in the next bio.
*
* If the target wants to receive multiple copies of the bio (via num_*bios, etc),
* the partially processed part (the sum of regions 1+2) must be the same for all
* copies of the bio.
*/
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	unsigned remaining = bio->bi_iter.bi_size >> SECTOR_SHIFT;
	unsigned deferred;

	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
	BUG_ON(remaining > *tio->len_ptr);
	BUG_ON(n_sectors > remaining);

	/* sectors the target declined; they shrink the length reported back */
	deferred = remaining - n_sectors;
	*tio->len_ptr -= deferred;

	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
/*
* Flush current->bio_list when the target map method blocks.
* This fixes deadlocks in snapshot and possibly in other targets.
*
* Used on the stack by __map_bio(): a block plug plus a plug callback
* whose function is flush_current_bio_list().
*/
struct dm_offload {
struct blk_plug plug;
struct blk_plug_cb cb;
};
/*
* Plug callback registered by dm_offload_start().
*
* Empties both of current's bio lists and hands every bio that belongs to
* a bio_set other than fs_bio_set to that bio_set's rescue workqueue.
* Bios without a bio_set, or from fs_bio_set, are put back on the list
* they came from.
*/
static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
{
struct dm_offload *o = container_of(cb, struct dm_offload, cb);
struct bio_list list;
struct bio *bio;
int i;
/* re-initialise the node; presumably so dm_offload_end()'s list_del() stays valid */
INIT_LIST_HEAD(&o->cb.list);
if (unlikely(!current->bio_list))
return;
for (i = 0; i < 2; i++) {
/* take a private copy and empty the original before sorting the bios */
list = current->bio_list[i];
bio_list_init(&current->bio_list[i]);
while ((bio = bio_list_pop(&list))) {
struct bio_set *bs = bio->bi_pool;
if (unlikely(!bs) || bs == fs_bio_set) {
bio_list_add(&current->bio_list[i], bio);
continue;
}
spin_lock(&bs->rescue_lock);
bio_list_add(&bs->rescue_list, bio);
queue_work(bs->rescue_workqueue, &bs->rescue_work);
spin_unlock(&bs->rescue_lock);
}
}
}
/*
* Start a block plug and add o's callback, set to
* flush_current_bio_list(), to the callback list of current->plug.
*/
static void dm_offload_start(struct dm_offload *o)
{
blk_start_plug(&o->plug);
o->cb.callback = flush_current_bio_list;
list_add(&o->cb.list, &current->plug->cb_list);
}
/* Undo dm_offload_start(): remove the callback and finish the plug. */
static void dm_offload_end(struct dm_offload *o)
{
list_del(&o->cb.list);
blk_finish_plug(&o->plug);
}
/*
* Hand one clone to its target's map method and act on the result:
* DM_MAPIO_REMAPPED dispatches the clone, a negative value or
* DM_MAPIO_REQUEUE fails/requeues the io, DM_MAPIO_SUBMITTED needs no
* further action, and anything else is a BUG.
*
* Takes a reference on the dm_io for the clone before mapping.
*/
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
struct dm_offload o;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
clone->bi_end_io = clone_endio;
/*
* Map the clone. If r == 0 we don't need to do
* anything, the target has assumed ownership of
* this io.
*/
atomic_inc(&tio->io->io_count);
/* remember the pre-map sector for the remap tracepoint below */
sector = clone->bi_iter.bi_sector;
dm_offload_start(&o);
r = ti->type->map(ti, clone);
dm_offload_end(&o);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
generic_make_request(clone);
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
/* error the io and bail out, or requeue it if needed */
dec_pending(tio->io, r);
free_tio(tio);
} else if (r != DM_MAPIO_SUBMITTED) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
}
/*
* State carried through the splitting of one bio; filled in by
* __split_and_process_bio().
*/
struct clone_info {
struct mapped_device *md;
/* table the bio is being mapped against */
struct dm_table *map;
/* bio that clones are made from (md->flush_bio for a flush) */
struct bio *bio;
/* per-bio tracking structure shared by all clones */
struct dm_io *io;
/* next sector still to be processed */
sector_t sector;
/* sectors still to be processed */
unsigned sector_count;
};
/* Point @bio's iterator at @sector with a size of @len sectors. */
static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
	struct bvec_iter *iter = &bio->bi_iter;

	iter->bi_sector = sector;
	iter->bi_size = to_bytes(len);
}
/*
* Creates a bio that consists of range of complete bvecs.
*
* Sets up tio's embedded clone as a copy of bio covering len sectors
* starting at sector, cloning and trimming integrity data when bio
* carries any. Returns 0, or the negative error from
* bio_integrity_clone().
*/
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
__bio_clone_fast(clone, bio);
if (bio_integrity(bio)) {
int r = bio_integrity_clone(clone, bio, GFP_NOIO);
if (r < 0)
return r;
}
/* skip ahead from the clone's current start to the requested sector */
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
if (bio_integrity(bio))
bio_integrity_trim(clone, 0, len);
return 0;
}
/*
 * Allocate a clone bio from md's bio_set and return the dm_target_io that
 * contains it, initialised with the dm_io from @ci, the target and the
 * clone's index.
 */
static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti,
				      unsigned target_bio_nr)
{
	struct bio *clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	struct dm_target_io *tio = container_of(clone, struct dm_target_io, clone);

	tio->target_bio_nr = target_bio_nr;
	tio->ti = ti;
	tio->io = ci->io;

	return tio;
}
/*
* Clone ci->bio for target ti and map it. When len is non-NULL the clone
* is set to start at ci->sector and span *len sectors; with a NULL len
* the clone keeps the iterator copied from ci->bio.
*/
static void __clone_and_map_simple_bio(struct clone_info *ci,
struct dm_target *ti,
unsigned target_bio_nr, unsigned *len)
{
struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
struct bio *clone = &tio->clone;
tio->len_ptr = len;
__bio_clone_fast(clone, ci->bio);
if (len)
bio_setup_sector(clone, ci->sector, *len);
__map_bio(tio);
}
/*
 * Send @num_bios clones of ci->bio to @ti, numbered 0 .. num_bios - 1.
 * @len is passed through to each clone.
 */
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	unsigned nr = 0;

	while (nr < num_bios) {
		__clone_and_map_simple_bio(ci, ti, nr, len);
		nr++;
	}
}
/*
 * Send num_flush_bios data-less clones to every target of the table.
 * ci->bio must carry no data. Always returns 0.
 */
static int __send_empty_flush(struct clone_info *ci)
{
	struct dm_target *ti;
	unsigned target_nr;

	BUG_ON(bio_has_data(ci->bio));

	for (target_nr = 0;
	     (ti = dm_table_get_target(ci->map, target_nr));
	     target_nr++)
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	return 0;
}
/*
 * Clone *len sectors of ci->bio starting at @sector and map the clone(s)
 * to @ti.
 *
 * One clone is sent, unless this is a write and the target provides
 * num_write_bios(), in which case that many are sent. Returns 0, or the
 * negative error from clone_bio(); clones mapped before the failure are
 * not recalled.
 */
static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				    sector_t sector, unsigned *len)
{
	struct bio *bio = ci->bio;
	unsigned copies = 1;
	unsigned nr;
	int r = 0;

	/*
	 * Does the target want to receive duplicate copies of the bio?
	 */
	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
		copies = ti->num_write_bios(ti, bio);

	for (nr = 0; nr < copies; nr++) {
		struct dm_target_io *tio = alloc_tio(ci, ti, nr);

		tio->len_ptr = len;

		r = clone_bio(tio, bio, sector, *len);
		if (r < 0) {
			free_tio(tio);
			return r;
		}

		__map_bio(tio);
	}

	return r;
}
/*
* Callbacks used by __send_changing_extent_only() to query a target:
* how many clones it wants, and whether it requires splitting.
*/
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
/* Number of discard clones the target asks for. */
static unsigned get_num_discard_bios(struct dm_target *ti)
{
return ti->num_discard_bios;
}
/* Number of WRITE SAME clones the target asks for. */
static unsigned get_num_write_same_bios(struct dm_target *ti)
{
return ti->num_write_same_bios;
}
typedef bool (*is_split_required_fn)(struct dm_target *ti);
/* Whether the target sets split_discard_bios. */
static bool is_split_required_for_discard(struct dm_target *ti)
{
return ti->split_discard_bios;
}
/*
* Walk the range described by ci target by target, sending each target
* the number of clones reported by get_num_bios.
*
* Returns 0 when the whole range was sent, -EIO when no valid target maps
* the current sector, or -EOPNOTSUPP when a target asks for zero clones.
*/
static int __send_changing_extent_only(struct clone_info *ci,
get_num_bios_fn get_num_bios,
is_split_required_fn is_split_required)
{
struct dm_target *ti;
unsigned len;
unsigned num_bios;
do {
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
/*
* Even though the device advertised support for this type of
* request, that does not mean every target supports it, and
* reconfiguration might also have changed that since the
* check was performed.
*/
num_bios = get_num_bios ? get_num_bios(ti) : 0;
if (!num_bios)
return -EOPNOTSUPP;
/*
* No split required: limit only by the target boundary. Otherwise
* (including when no callback is given) also honour max_io_len.
*/
if (is_split_required && !is_split_required(ti))
len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
else
len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
__send_duplicate_bios(ci, ti, num_bios, &len);
ci->sector += len;
/* the loop condition also consumes len from the remaining count */
} while (ci->sector_count -= len);
return 0;
}
/* Send a discard, letting each target decide whether splitting is needed. */
static int __send_discard(struct clone_info *ci)
{
return __send_changing_extent_only(ci, get_num_discard_bios,
is_split_required_for_discard);
}
/* Send a WRITE SAME; with no split callback, max_io_len is always honoured. */
static int __send_write_same(struct clone_info *ci)
{
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
/*
 * Select the correct strategy for processing a non-flush bio.
 *
 * Discards and WRITE SAME requests go to their dedicated senders. Any
 * other bio has one chunk, limited by max_io_len() and the remaining
 * sector count, cloned and mapped to the target that owns ci->sector;
 * ci->sector and ci->sector_count are then advanced.
 *
 * Returns 0 on success, -EIO when no valid target maps the sector, or the
 * error from the sender / clone step.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct dm_target *ti;
	unsigned len;
	int r;

	switch (bio_op(ci->bio)) {
	case REQ_OP_DISCARD:
		return __send_discard(ci);
	case REQ_OP_WRITE_SAME:
		return __send_write_same(ci);
	default:
		break;
	}

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);

	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
	if (r < 0)
		return r;

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}
/*
* Entry point to split a bio into clones and submit them to the targets.
*
* Fails the bio immediately when there is no table. Otherwise a dm_io is
* set up holding one reference for this function, accounting is started,
* and the bio is sent either as an empty flush (REQ_PREFLUSH) or chunk by
* chunk until done or an error occurs. The final dec_pending() drops this
* function's reference and records any error.
*/
static void __split_and_process_bio(struct mapped_device *md,
struct dm_table *map, struct bio *bio)
{
struct clone_info ci;
int error = 0;
if (unlikely(!map)) {
bio_io_error(bio);
return;
}
ci.map = map;
ci.md = md;
ci.io = alloc_io(md);
ci.io->error = 0;
/* this function's own reference; dropped by dec_pending() at the end */
atomic_set(&ci.io->io_count, 1);
ci.io->bio = bio;
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_iter.bi_sector;
start_io_acct(ci.io);
if (bio->bi_opf & REQ_PREFLUSH) {
/* clones are made from md's data-less flush_bio, not from bio itself */
ci.bio = &ci.md->flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
} else {
ci.bio = bio;
ci.sector_count = bio_sectors(bio);
while (ci.sector_count && !error)
error = __split_and_process_non_flush(&ci);
}
/* drop the extra reference count */
dec_pending(ci.io, error);
}
/*-----------------------------------------------------------------
* CRUD END
*---------------------------------------------------------------*/
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
*
* Accounts the start of the I/O, then either defers the bio while
* DMF_BLOCK_IO_FOR_SUSPEND is set (read-ahead bios are failed instead of
* queued) or splits and maps it against the live table. Always returns
* BLK_QC_T_NONE.
*/
static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int srcu_idx;
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);
return BLK_QC_T_NONE;
}
/* the live table reference is held across the whole split/map step */
__split_and_process_bio(md, map, bio);
dm_put_live_table(md, srcu_idx);
return BLK_QC_T_NONE;
}
/*
 * Congestion callback installed by dm_init_normal_md_queue().
 *
 * Returns @bdi_bits unchanged while DMF_BLOCK_IO_FOR_SUSPEND is set or
 * when there is no live table. Request-based devices report the state of
 * their own queue; bio-based devices ask the live table.
 */
static int dm_any_congested(void *congested_data, int bdi_bits)
{
	struct mapped_device *md = congested_data;
	struct dm_table *map;
	int r = bdi_bits;

	if (test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
		return r;

	/*
	 * With request-based DM we only need to check the
	 * top-level queue for congestion.
	 */
	if (dm_request_based(md))
		return md->queue->backing_dev_info->wb.state & bdi_bits;

	map = dm_get_live_table_fast(md);
	if (map)
		r = dm_table_any_congested(map, bdi_bits);
	dm_put_live_table_fast(md);

	return r;
}
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
/* Remove minor from _minor_idr under _minor_lock. */
static void free_minor(int minor)
{
spin_lock(&_minor_lock);
idr_remove(&_minor_idr, minor);
spin_unlock(&_minor_lock);
}
/*
 * See if the device with a specific minor # is free.
 *
 * Reserves exactly @minor in _minor_idr with the MINOR_ALLOCED
 * placeholder. Returns 0 on success, -EINVAL when @minor does not fit in
 * MINORBITS, -EBUSY when the minor is already taken, or another negative
 * errno from idr_alloc().
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	/* preload outside the spinlock so the allocation itself can be GFP_NOWAIT */
	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();

	if (r >= 0)
		return 0;
	if (r == -ENOSPC)
		return -EBUSY;
	return r;
}
/*
 * Reserve the lowest free minor below 1 << MINORBITS in _minor_idr with
 * the MINOR_ALLOCED placeholder. On success stores it in *minor and
 * returns 0; otherwise returns the negative errno from idr_alloc() and
 * leaves *minor untouched.
 */
static int next_free_minor(int *minor)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	id = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();

	if (id < 0)
		return id;

	*minor = id;
	return 0;
}
static const struct block_device_operations dm_blk_dops;
static void dm_wq_work(struct work_struct *work);
/*
* Common initialisation of md->queue: clear QUEUE_FLAG_STACKABLE and point
* the queue's queuedata and its backing_dev_info's congested_data at md.
*/
void dm_init_md_queue(struct mapped_device *md)
{
/*
* Request-based dm devices cannot be stacked on top of bio-based dm
* devices. The type of this dm device may not have been decided yet.
* The type is decided at the first table loading time.
* To prevent problematic device stacking, clear the queue flag
* for request stacking support until then.
*
* This queue is new, so no concurrency on the queue_flags.
*/
queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
/*
* Initialize data that will only be used by a non-blk-mq DM queue
* - must do so here (in alloc_dev callchain) before queue is used
*/
md->queue->queuedata = md;
md->queue->backing_dev_info->congested_data = md;
}
/*
* Queue initialisation for a device that does not use blk-mq: mark md
* accordingly, run the common dm_init_md_queue() setup, install
* dm_any_congested() as the congestion callback and set the bounce limit
* to BLK_BOUNCE_ANY.
*/
void dm_init_normal_md_queue(struct mapped_device *md)
{
md->use_blk_mq = false;
dm_init_md_queue(md);
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
md->queue->backing_dev_info->congested_fn = dm_any_congested;
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
}
/*
* Release the resources attached to md. Each resource is checked (or the
* destroy helper is called directly) so that a partially set-up device
* can be passed in.
*/
static void cleanup_mapped_device(struct mapped_device *md)
{
if (md->wq)
destroy_workqueue(md->wq);
if (md->kworker_task)
kthread_stop(md->kworker_task);
mempool_destroy(md->io_pool);
mempool_destroy(md->rq_pool);
if (md->bs)
bioset_free(md->bs);
if (md->disk) {
/*
* Clear private_data under _minor_lock: dm_blk_open() reads it
* under the same lock and fails with -ENXIO once it is NULL.
*/
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
del_gendisk(md->disk);
put_disk(md->disk);
}
if (md->queue)
blk_cleanup_queue(md->queue);
cleanup_srcu_struct(&md->io_barrier);
if (md->bdev) {
bdput(md->bdev);
md->bdev = NULL;
}
dm_mq_cleanup_mapped_device(md);
}
/*
 * Allocate and initialise a blank device with a given minor.
 *
 * @minor: the minor number to claim, or DM_ANY_MINOR to take the next
 *         free one.
 *
 * On success returns the new mapped_device: zero-allocated, holders == 1,
 * type DM_TYPE_NONE, with its queue, gendisk (already added), workqueue and
 * block_device set up, and published in _minor_idr.  Returns NULL on any
 * failure, after undoing whatever had been set up.
 */
static struct mapped_device *alloc_dev(int minor)
{
int r, numa_node_id = dm_get_numa_node();
struct mapped_device *md;
void *old_md;
md = vzalloc_node(sizeof(*md), numa_node_id);
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
}
/* Module reference held for the lifetime of the device; dropped in free_dev(). */
if (!try_module_get(THIS_MODULE))
goto bad_module_get;
/* get a minor number for the dev */
if (minor == DM_ANY_MINOR)
r = next_free_minor(&minor);
else
r = specific_minor(minor);
if (r < 0)
goto bad_minor;
r = init_srcu_struct(&md->io_barrier);
if (r < 0)
goto bad_io_barrier;
/* Plain field, lock and list initialisation; nothing below can fail until the queue. */
md->numa_node_id = numa_node_id;
md->use_blk_mq = dm_use_blk_mq_default();
md->init_tio_pdu = false;
md->type = DM_TYPE_NONE;
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
mutex_init(&md->table_devices_lock);
spin_lock_init(&md->deferred_lock);
/* The creator owns the initial reference. */
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
atomic_set(&md->event_nr, 0);
atomic_set(&md->uevent_seq, 0);
INIT_LIST_HEAD(&md->uevent_list);
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
dm_init_md_queue(md);
md->disk = alloc_disk_node(1, numa_node_id);
if (!md->disk)
goto bad;
atomic_set(&md->pending[0], 0);
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
md->kworker_task = NULL;
/* Describe the gendisk and register it; the disk is named "dm-<minor>". */
md->disk->major = _major;
md->disk->first_minor = minor;
md->disk->fops = &dm_blk_dops;
md->disk->queue = md->queue;
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
if (!md->wq)
goto bad;
md->bdev = bdget_disk(md->disk, 0);
if (!md->bdev)
goto bad;
/* Pre-built flush bio embedded in md, targeting the device's own bdev. */
bio_init(&md->flush_bio);
md->flush_bio.bi_bdev = md->bdev;
bio_set_op_attrs(&md->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
spin_unlock(&_minor_lock);
/* The slot must still hold the placeholder stored when the minor was reserved. */
BUG_ON(old_md != MINOR_ALLOCED);
return md;
/*
 * Error unwinding: each label undoes the steps that had succeeded before
 * the jump and then falls through to the labels below it.
 */
bad:
cleanup_mapped_device(md);
bad_io_barrier:
free_minor(minor);
bad_minor:
module_put(THIS_MODULE);
bad_module_get:
kvfree(md);
return NULL;
}
/* Forward declaration: free_dev() needs unlock_fs(), which is defined later in this file. */
static void unlock_fs(struct mapped_device *md);
/*
 * Final teardown of a mapped device: thaw any frozen filesystem, release
 * the device's resources, table devices and statistics, give the minor
 * number back, drop the module reference taken in alloc_dev() and free
 * @md itself.  @md must not be used after this returns.
 */
static void free_dev(struct mapped_device *md)
{
/* Read the minor first: cleanup_mapped_device() releases md->disk. */
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
cleanup_mapped_device(md);
free_table_devices(&md->table_devices);
dm_stats_cleanup(&md->stats);
free_minor(minor);
module_put(THIS_MODULE);
kvfree(md);
}
/*
 * Move the mempools prepared for table @t into @md, then free whatever is
 * left in the table's mempool container.
 *
 * - If @md has no bioset yet, it takes over io_pool, rq_pool and bs.
 * - If @md already has a bioset and @t is bio-based, only the bioset is
 *   replaced.
 * - Otherwise (@md has a bioset, @t is request-based) nothing is moved.
 */
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *pools = dm_table_get_md_mempools(t);

	if (!md->bs) {
		/* First bind: the device must not own any pool yet. */
		BUG_ON(!pools || md->io_pool || md->rq_pool || md->bs);

		md->io_pool = pools->io_pool;
		pools->io_pool = NULL;

		md->rq_pool = pools->rq_pool;
		pools->rq_pool = NULL;

		md->bs = pools->bs;
		pools->bs = NULL;
	} else if (dm_table_bio_based(t)) {
		/*
		 * The md already has the necessary mempools, but the bioset
		 * must be reloaded because front_pad may have changed with
		 * the newly loaded table.
		 */
		bioset_free(md->bs);
		md->bs = pools->bs;
		pools->bs = NULL;
	}
	/*
	 * Nothing to reload for request-based dm: the size of front_pad
	 * doesn't change.  Note for future: if you are to reload the bioset,
	 * prep-ed requests in the queue may refer to bios from the old
	 * bioset, so you must walk through the queue to unprep.
	 */

	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
}
/*
* Bind a table to the device.
*/
static void event_callback(void *context)
{
unsigned long flags;
LIST_HEAD(uevents);
struct mapped_device *md = (struct mapped_device *) context;
spin_lock_irqsave(&md->uevent_lock, flags);
list_splice_init(&md->uevent_list, &uevents);
spin_unlock_irqrestore(&md->uevent_lock, flags);
dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
atomic_inc(&md->event_nr);
wake_up(&md->eventq);
}
/*
 * Set the device size: @size (in sectors) becomes the gendisk capacity and,
 * converted to bytes, the size of the block device inode.
 *
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	loff_t nr_bytes = (loff_t)size << SECTOR_SHIFT;

	set_capacity(md->disk, size);
	i_size_write(md->bdev->bd_inode, nr_bytes);
}
/*
 * Make @t the live table of @md.
 *
 * @md:     device to bind to; md->suspend_lock must be held.
 * @t:      table to install.
 * @limits: queue limits applied to the queue through
 *          dm_table_set_restrictions().
 *
 * Returns old map, which caller must destroy (NULL if there was none).
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct queue_limits *limits)
{
struct dm_table *old_map;
struct request_queue *q = md->queue;
sector_t size;
lockdep_assert_held(&md->suspend_lock);
size = dm_table_get_size(t);
/*
 * Wipe any geometry if the size of the table changed.
 */
if (size != dm_get_size(md))
memset(&md->geometry, 0, sizeof(md->geometry));
__set_size(md, size);
/* Route this table's events to event_callback() with md as context. */
dm_table_event_callback(t, event_callback, md);
/*
 * The queue hasn't been stopped yet, if the old table type wasn't
 * for request-based during suspension.  So stop it to prevent
 * I/O mapping before resume.
 * This must be done before setting the queue restrictions,
 * because request-based dm may be run just after the setting.
 */
if (dm_table_request_based(t)) {
dm_stop_queue(q);
/*
 * Leverage the fact that request-based DM targets are
 * immutable singletons and establish md->immutable_target
 * - used to optimize both dm_request_fn and dm_mq_queue_rq
 */
md->immutable_target = dm_table_get_immutable_target(t);
}
__bind_mempools(md, t);
/* Publish the new table; the previous one is handed back to the caller. */
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
/* Only needed when a table was replaced. */
if (old_map)
dm_sync_table(md);
return old_map;
}
/*
 * Detach the live table from @md, if there is one.
 *
 * Returns the unbound table for the caller to free, or NULL when no table
 * was bound.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *old = rcu_dereference_protected(md->map, 1);

	if (old) {
		/* Stop delivering table events before unpublishing the map. */
		dm_table_event_callback(old, NULL, NULL);
		RCU_INIT_POINTER(md->map, NULL);
		dm_sync_table(md);
	}

	return old;
}
/*
 * Constructor for a new device.
 *
 * @minor:  minor number to use, or DM_ANY_MINOR to let alloc_dev() pick one.
 * @result: receives the new device on success; left untouched on failure.
 *
 * Returns 0 on success, -ENXIO if the device could not be allocated, or
 * the error reported by dm_sysfs_init().  In the latter case the
 * half-created device is torn down again instead of being handed to the
 * caller without its sysfs representation.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;
	int r;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	/* Do not report success if the sysfs kobject could not be registered. */
	r = dm_sysfs_init(md);
	if (r) {
		free_dev(md);
		return r;
	}

	*result = md;
	return 0;
}
/*
* Functions to manage md->type.
* All are required to hold md->type_lock.
*/
/* Acquire md->type_lock; pair with dm_unlock_md_type(). */
void dm_lock_md_type(struct mapped_device *md)
{
mutex_lock(&md->type_lock);
}
/* Release md->type_lock taken by dm_lock_md_type(). */
void dm_unlock_md_type(struct mapped_device *md)
{
mutex_unlock(&md->type_lock);
}
/*
 * Record the device type.  md->type_lock must be locked on entry; this is
 * checked with mutex_is_locked() and violations are fatal (BUG_ON).
 */
void dm_set_md_type(struct mapped_device *md, unsigned type)
{
BUG_ON(!mutex_is_locked(&md->type_lock));
md->type = type;
}
/* Return the device type stored in md->type; no lock is taken or checked here. */
unsigned dm_get_md_type(struct mapped_device *md)
{
return md->type;
}
/* Return md->immutable_target_type, which __bind() records from the bound table. */
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
return md->immutable_target_type;
}
/*
 * Return a pointer to the limits embedded in md's request queue.
 *
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'; calling this with md->holders at zero is fatal (BUG_ON).
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
BUG_ON(!atomic_read(&md->holders));
return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
/*
 * Setup the DM device's queue based on md's type.
 *
 * @t is only used for the blk-mq request-based type.
 *
 * Returns 0 on success (also for a type that none of the branches below
 * handles), or the error from the request-queue initialiser.
 */
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
	unsigned type = dm_get_md_type(md);
	int r;

	if (type == DM_TYPE_REQUEST_BASED) {
		r = dm_old_init_request_queue(md);
		if (r) {
			DMERR("Cannot initialize queue for request-based mapped device");
		}
		return r;
	}

	if (type == DM_TYPE_MQ_REQUEST_BASED) {
		r = dm_mq_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
		}
		return r;
	}

	if (type == DM_TYPE_BIO_BASED || type == DM_TYPE_DAX_BIO_BASED) {
		dm_init_normal_md_queue(md);
		blk_queue_make_request(md->queue, dm_make_request);

		/*
		 * DM handles splitting bios as needed.  Free the bio_split
		 * bioset since it won't be used (saves 1 process per
		 * bio-based DM device).
		 */
		bioset_free(md->queue->bio_split);
		md->queue->bio_split = NULL;

		if (type == DM_TYPE_DAX_BIO_BASED)
			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
	}

	return 0;
}
/*
 * Look up the mapped_device for @dev and take a reference on it.
 *
 * Returns NULL if @dev is not a DM device number, if no device is
 * registered for that minor, if the slot only holds the MINOR_ALLOCED
 * placeholder, if the disk's minor does not match, or if the device is
 * being deleted or freed.  Otherwise the device is returned with its holder
 * count raised; release it with dm_put().
 */
struct mapped_device *dm_get_md(dev_t dev)
{
	unsigned minor = MINOR(dev);
	struct mapped_device *md;

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (!md)
		goto unlock;

	if (md == MINOR_ALLOCED ||
	    MINOR(disk_devt(dm_disk(md))) != minor ||
	    dm_deleting_md(md) ||
	    test_bit(DMF_FREEING, &md->flags))
		md = NULL;
	else
		dm_get(md);

unlock:
	spin_unlock(&_minor_lock);
	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);
/* Return the opaque pointer previously stored with dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
return md->interface_ptr;
}
/* Store an opaque pointer in md->interface_ptr; read it back with dm_get_mdptr(). */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
md->interface_ptr = ptr;
}
/*
 * Take a reference on @md by incrementing md->holders.
 *
 * Taking a reference on a device that is already marked DMF_FREEING is a
 * fatal error (BUG_ON); note that the check runs after the increment.
 */
void dm_get(struct mapped_device *md)
{
atomic_inc(&md->holders);
BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
/*
 * Take a reference on @md unless it is already being freed.
 *
 * The DMF_FREEING test and the dm_get() are done under _minor_lock, the
 * same lock __dm_destroy() holds while setting DMF_FREEING.
 *
 * Returns 0 with a reference held, or -EBUSY if the device is being freed.
 */
int dm_hold(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags))
		r = -EBUSY;
	else
		dm_get(md);
	spin_unlock(&_minor_lock);

	return r;
}
EXPORT_SYMBOL_GPL(dm_hold);
/*
 * Return the device's name string (md->name), which alloc_dev() fills in
 * with format_dev_t() from the device's major and minor numbers.
 */
const char *dm_device_name(struct mapped_device *md)
{
return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
/*
 * Tear down and free a mapped device.
 *
 * @md:   device to destroy.
 * @wait: if true, poll until md->holders drops to zero before freeing;
 *        if false, only warn when references remain and free anyway.
 *
 * May sleep.  @md is freed on return.
 */
static void __dm_destroy(struct mapped_device *md, bool wait)
{
struct request_queue *q = dm_get_md_queue(md);
struct dm_table *map;
int srcu_idx;
might_sleep();
/*
 * Under _minor_lock: put the placeholder back into the minor's idr slot
 * and mark the device as being freed, so the lookup helpers in this file
 * (dm_get_md(), dm_hold(), dm_get_from_kobject()) refuse it from now on.
 */
spin_lock(&_minor_lock);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
/* Flag the request queue as dying. */
spin_lock_irq(q->queue_lock);
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
if (dm_request_based(md) && md->kworker_task)
kthread_flush_worker(&md->kworker);
/*
 * Take suspend_lock so that presuspend and postsuspend methods
 * do not race with internal suspend.
 */
mutex_lock(&md->suspend_lock);
map = dm_get_live_table(md, &srcu_idx);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
dm_put_live_table(md, srcu_idx);
mutex_unlock(&md->suspend_lock);
/*
 * Rare, but there may be I/O requests still going to complete,
 * for example.  Wait for all references to disappear.
 * No one should increment the reference count of the mapped_device,
 * after the mapped_device state becomes DMF_FREEING.
 */
if (wait)
while (atomic_read(&md->holders))
msleep(1);
else if (atomic_read(&md->holders))
DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
dm_device_name(md), atomic_read(&md->holders));
/* Remove the sysfs entry, destroy the bound table (if any), then free md. */
dm_sysfs_exit(md);
dm_table_destroy(__unbind(md));
free_dev(md);
}
/* Destroy @md, waiting for all outstanding references to be dropped first. */
void dm_destroy(struct mapped_device *md)
{
__dm_destroy(md, true);
}
/*
 * Destroy @md without waiting for outstanding references; a warning is
 * logged by __dm_destroy() if any remain.
 */
void dm_destroy_immediate(struct mapped_device *md)
{
__dm_destroy(md, false);
}
/*
 * Drop a reference taken with dm_get().  Nothing is freed here: the count
 * is only decremented, and __dm_destroy() polls md->holders when asked to
 * wait.
 */
void dm_put(struct mapped_device *md)
{
atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
int r = 0;
DEFINE_WAIT(wait);
while (1) {
prepare_to_wait(&md->wait, &wait, task_state);
if (!md_in_flight(md))
break;
if (signal_pending_state(task_state, current)) {
r = -EINTR;
break;
}
io_schedule();
}
finish_wait(&md->wait, &wait);
return r;
}
/*
 * Process the deferred bios.
 *
 * Work handler for md->work: pops bios off md->deferred one at a time and
 * resubmits them, stopping when the list is empty or as soon as
 * DMF_BLOCK_IO_FOR_SUSPEND is set.
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct dm_table *live;
	struct bio *bio;
	int idx;

	live = dm_get_live_table(md, &idx);

	for (;;) {
		if (test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
			break;

		spin_lock_irq(&md->deferred_lock);
		bio = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!bio)
			break;

		if (dm_request_based(md))
			generic_make_request(bio);
		else
			__split_and_process_bio(md, live, bio);
	}

	dm_put_live_table(md, idx);
}
/*
 * Let deferred I/O flow again: clear DMF_BLOCK_IO_FOR_SUSPEND and queue
 * md->work on md->wq so that dm_wq_work() drains md->deferred.
 */
static void dm_queue_flush(struct mapped_device *md)
{
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
/* Barrier after the atomic bit clear, before the work is queued. */
smp_mb__after_atomic();
queue_work(md->wq, &md->work);
}
/*
* Swap in a new table, returning the old one for the caller to destroy.
*/
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
struct queue_limits limits;
int r;
mutex_lock(&md->suspend_lock);
/* device must be suspended */
if (!dm_suspended_md(md))
goto out;
/*
* If the new table has no data devices, retain the existing limits.
* This helps multipath with queue_if_no_path if all paths disappear,
* then new I/O is queued based on these limits, and then some paths
* reappear.
*/
if (dm_table_has_no_data_devices(table)) {
live_map = dm_get_live_table_fast(md);
if (live_map)
limits = md->queue->limits;
dm_put_live_table_fast(md);
}
if (!live_map) {
r = dm_calculate_queue_limits(table, &limits);
if (r) {
map = ERR_PTR(r);
goto out;
}
}
map = __bind(md, table, &limits);
out:
mutex_unlock(&md->suspend_lock);
return map;
}
/*
 * Functions to lock and unlock any filesystem running on the
 * device: lock_fs() freezes it, unlock_fs() thaws it again.
 *
 * lock_fs() returns 0 with DMF_FROZEN set and the frozen superblock
 * remembered in md->frozen_sb, or the error from freeze_bdev() with
 * md->frozen_sb reset to NULL.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (!IS_ERR(md->frozen_sb)) {
		set_bit(DMF_FROZEN, &md->flags);
		return 0;
	}

	/* Freeze failed: translate the error and forget the bogus pointer. */
	r = PTR_ERR(md->frozen_sb);
	md->frozen_sb = NULL;
	return r;
}
/*
 * Thaw the filesystem frozen by lock_fs().  Does nothing unless DMF_FROZEN
 * is set, so it is safe to call when no freeze took place.
 */
static void unlock_fs(struct mapped_device *md)
{
	if (test_bit(DMF_FROZEN, &md->flags)) {
		thaw_bdev(md->bdev, md->frozen_sb);
		md->frozen_sb = NULL;
		clear_bit(DMF_FROZEN, &md->flags);
	}
}
/*
 * Core of suspend, shared by dm_suspend() and __dm_internal_suspend().
 *
 * @map: the live table (may be NULL).
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now.  There is no request-processing activity.  All new requests
 * are being added to md->deferred list.
 *
 * On failure a negative error is returned and the steps taken so far
 * (I/O blocking, queue stop, filesystem freeze, target presuspend) are
 * undone before returning.
 *
 * Caller must hold md->suspend_lock
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
unsigned suspend_flags, long task_state,
int dmf_suspended_flag)
{
bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
int r;
lockdep_assert_held(&md->suspend_lock);
/*
 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
 * This flag is cleared before dm_suspend returns.
 */
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
/*
 * This gets reverted if there's an error later and the targets
 * provide the .presuspend_undo hook.
 */
dm_table_presuspend_targets(map);
/*
 * Flush I/O to the device.
 * Any I/O submitted after lock_fs() may not be flushed.
 * noflush takes precedence over do_lockfs.
 * (lock_fs() flushes I/Os and waits for them to complete.)
 */
if (!noflush && do_lockfs) {
r = lock_fs(md);
if (r) {
dm_table_presuspend_undo_targets(map);
return r;
}
}
/*
 * Here we must make sure that no processes are submitting requests
 * to target drivers i.e. no one may be executing
 * __split_and_process_bio.  This is called from dm_request and
 * dm_wq_work.
 *
 * To get all processes out of __split_and_process_bio in dm_request,
 * we take the write lock.  To prevent any process from reentering
 * __split_and_process_bio from dm_request and quiesce the thread
 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
 * flush_workqueue(md->wq).
 */
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
if (map)
synchronize_srcu(&md->io_barrier);
/*
 * Stop md->queue before flushing md->wq in case request-based
 * dm defers requests to md->wq from md->queue.
 */
if (dm_request_based(md)) {
dm_stop_queue(md->queue);
if (md->kworker_task)
kthread_flush_worker(&md->kworker);
}
flush_workqueue(md->wq);
/*
 * At this point no more requests are entering target request routines.
 * We call dm_wait_for_completion to wait for all existing requests
 * to finish.
 */
r = dm_wait_for_completion(md, task_state);
/* Only mark the device suspended if the wait was not interrupted. */
if (!r)
set_bit(dmf_suspended_flag, &md->flags);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
if (map)
synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
/* Roll back: unblock I/O, restart the queue, thaw the fs, undo presuspend. */
dm_queue_flush(md);
if (dm_request_based(md))
dm_start_queue(md->queue);
unlock_fs(md);
dm_table_presuspend_undo_targets(map);
/* pushback list is already flushed, so skip flush */
}
return r;
}
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
/*
 * Suspend @md on behalf of userspace.
 *
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG.
 *
 * Returns 0 on success, -EINVAL if the device is already suspended, or a
 * non-zero value from wait_on_bit() / __dm_suspend() if waiting was
 * interrupted or the suspend failed.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
int r = 0;
retry:
mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
if (dm_suspended_md(md)) {
r = -EINVAL;
goto out_unlock;
}
if (dm_suspended_internally_md(md)) {
/* already internally suspended, wait for internal resume */
mutex_unlock(&md->suspend_lock);
r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
if (r)
return r;
/* The lock was dropped while waiting, so re-check the state from scratch. */
goto retry;
}
map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
if (r)
goto out_unlock;
dm_table_postsuspend_targets(map);
out_unlock:
mutex_unlock(&md->suspend_lock);
return r;
}
/*
 * Core of resume, shared by dm_resume() and __dm_internal_resume().
 *
 * @map: table whose targets should be resumed first, or NULL to skip
 *       that step.
 *
 * Returns 0 on success, or the error from dm_table_resume_targets(), in
 * which case nothing else has been done.
 */
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	int r = map ? dm_table_resume_targets(map) : 0;

	if (r)
		return r;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}
/*
 * Resume a device that was suspended with dm_suspend().
 *
 * Returns 0 on success; -EINVAL if the device is not suspended, has no
 * table bound, or the bound table has size zero; or a non-zero value from
 * wait_on_bit() / __dm_resume() if waiting was interrupted or resuming
 * the targets failed.  DMF_SUSPENDED is only cleared on success.
 */
int dm_resume(struct mapped_device *md)
{
int r;
struct dm_table *map = NULL;
retry:
/* Default result for every early exit through "out" below. */
r = -EINVAL;
mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
if (!dm_suspended_md(md))
goto out;
if (dm_suspended_internally_md(md)) {
/* already internally suspended, wait for internal resume */
mutex_unlock(&md->suspend_lock);
r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
if (r)
return r;
/* The lock was dropped while waiting, so re-check the state from scratch. */
goto retry;
}
map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
if (!map || !dm_table_get_size(map))
goto out;
r = __dm_resume(md, map);
if (r)
goto out;
clear_bit(DMF_SUSPENDED, &md->flags);
out:
mutex_unlock(&md->suspend_lock);
return r;
}
/*
 * Internal suspend/resume works like userspace-driven suspend.  It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 *
 * Internal suspends nest: md->internal_suspend_count counts them and only
 * the outermost call does any work.  The visible caller holds
 * md->suspend_lock around this function.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
if (md->internal_suspend_count++)
return; /* nested internal suspend */
if (dm_suspended_md(md)) {
/* Already suspended by dm_suspend(): just record the internal suspend. */
set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
return; /* nest suspend */
}
map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
/*
 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
 * would require changing .presuspend to return an error -- avoid this
 * until there is a need for more elaborate variants of internal suspend.
 */
(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
DMF_SUSPENDED_INTERNALLY);
dm_table_postsuspend_targets(map);
}
/*
 * Undo one __dm_internal_suspend().  Only the call that brings
 * md->internal_suspend_count back to zero does any work; it then clears
 * DMF_SUSPENDED_INTERNALLY and wakes the wait_on_bit() sleepers in
 * dm_suspend() / dm_resume().
 *
 * Calling this without a matching internal suspend is fatal (BUG_ON).
 */
static void __dm_internal_resume(struct mapped_device *md)
{
BUG_ON(!md->internal_suspend_count);
if (--md->internal_suspend_count)
return; /* resume from nested internal suspend */
/* Still suspended by dm_suspend(): leave I/O blocked, only drop the internal flag. */
if (dm_suspended_md(md))
goto done; /* resume from nested suspend */
/*
 * NOTE: existing callers don't need to call dm_table_resume_targets
 * (which may fail -- so best to avoid it for now by passing NULL map)
 */
(void) __dm_resume(md, NULL);
done:
clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
smp_mb__after_atomic();
wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}
/*
 * Kernel-internal suspend with DM_SUSPEND_NOFLUSH_FLAG; takes
 * md->suspend_lock around __dm_internal_suspend().  Balance each call with
 * dm_internal_resume().
 */
void dm_internal_suspend_noflush(struct mapped_device *md)
{
mutex_lock(&md->suspend_lock);
__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
/*
 * Counterpart of dm_internal_suspend_noflush(); takes md->suspend_lock
 * around __dm_internal_resume().
 */
void dm_internal_resume(struct mapped_device *md)
{
mutex_lock(&md->suspend_lock);
__dm_internal_resume(md);
mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 *
 * dm_internal_suspend_fast() ALWAYS returns with md->suspend_lock held,
 * including on the early return below; the matching
 * dm_internal_resume_fast() releases it.  If the device is not already
 * suspended, new I/O is blocked and in-flight I/O is waited for.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
mutex_lock(&md->suspend_lock);
/* Already quiesced: nothing to do, but keep the lock for the resume side. */
if (dm_suspended_md(md) || dm_suspended_internally_md(md))
return;
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
synchronize_srcu(&md->io_barrier);
flush_workqueue(md->wq);
/* Return value ignored: the wait is uninterruptible. */
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
/*
 * Counterpart of dm_internal_suspend_fast(): re-enables deferred I/O unless
 * the device is suspended by other means, then releases md->suspend_lock,
 * which the suspend side left held.
 */
void dm_internal_resume_fast(struct mapped_device *md)
{
	if (!dm_suspended_md(md) && !dm_suspended_internally_md(md))
		dm_queue_flush(md);

	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
/*
 * Emit a uevent of type @action for the device's disk kobject.
 *
 * When @cookie is non-zero it is passed along in the event environment as
 * "<DM_COOKIE_ENV_VAR_NAME>=<cookie>"; otherwise a plain uevent is sent.
 *
 * Returns the result of kobject_uevent() / kobject_uevent_env().
 */
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char cookie_env[DM_COOKIE_LENGTH];
	char *env[] = { cookie_env, NULL };

	if (cookie) {
		snprintf(cookie_env, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, env);
	}

	return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
}
/* Atomically increment md->uevent_seq and return the new value. */
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
return atomic_add_return(1, &md->uevent_seq);
}
/* Return the current event counter, which event_callback() increments. */
uint32_t dm_get_event_nr(struct mapped_device *md)
{
return atomic_read(&md->event_nr);
}
/*
 * Sleep interruptibly on md->eventq until the device's event counter
 * differs from @event_nr.  Returns the result of
 * wait_event_interruptible().
 */
int dm_wait_event(struct mapped_device *md, int event_nr)
{
return wait_event_interruptible(md->eventq,
(event_nr != atomic_read(&md->event_nr)));
}
/*
 * Queue @elist on md->uevent_list under md->uevent_lock.  The list is
 * spliced off and sent by event_callback().
 */
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
unsigned long flags;
spin_lock_irqsave(&md->uevent_lock, flags);
list_add(elist, &md->uevent_list);
spin_unlock_irqrestore(&md->uevent_lock, flags);
}
/*
 * Return the gendisk belonging to @md.
 *
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);
/* Return the kobject embedded in @md (md->kobj_holder.kobj). */
struct kobject *dm_kobject(struct mapped_device *md)
{
return &md->kobj_holder.kobj;
}
/*
 * Map the kobject embedded in a mapped_device back to its device and take
 * a reference on it.
 *
 * The state check and the dm_get() are done under _minor_lock.  Returns
 * NULL, without taking a reference, if the device is being freed or
 * deleted.
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md =
		container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md))
		md = NULL;
	else
		dm_get(md);
	spin_unlock(&_minor_lock);

	return md;
}
/* Non-zero if DMF_SUSPENDED is set, i.e. the device was suspended via dm_suspend(). */
int dm_suspended_md(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED, &md->flags);
}
/* Non-zero if DMF_SUSPENDED_INTERNALLY is set (kernel-internal suspend in effect). */
int dm_suspended_internally_md(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}
/* Non-zero if DMF_DEFERRED_REMOVE is set in md->flags. */
int dm_test_deferred_remove_flag(struct mapped_device *md)
{
return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}
/* Target-facing helper: is the mapped device that owns @ti's table suspended? */
int dm_suspended(struct dm_target *ti)
{
return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);
/*
 * Target-facing helper: result of __noflush_suspending() (defined elsewhere
 * in this file) for the mapped device that owns @ti's table.
 */
int dm_noflush_suspending(struct dm_target *ti)
{
return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
/*
 * Allocate the set of mempools a table of the given @type needs.
 *
 * @md:               device the pools are for; only its NUMA node is used.
 * @type:             one of the DM_TYPE_* values handled below; any other
 *                    value is fatal (BUG()).
 * @integrity:        if non-zero, also create the bioset's integrity pool.
 * @per_io_data_size: per-bio data size requested by targets; only affects
 *                    front_pad for the bio-based types.
 *
 * Which pools get created:
 *   bio-based / DAX bio-based: io_pool (from _io_cache) and bs
 *   request-based:             rq_pool, io_pool (from _rq_tio_cache) and bs
 *   blk-mq request-based:      bs only
 *
 * Returns the new container, or NULL on allocation failure (anything that
 * was already allocated is released through dm_free_md_mempools()).
 */
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
unsigned integrity, unsigned per_io_data_size)
{
struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
struct kmem_cache *cachep = NULL;
unsigned int pool_size = 0;
unsigned int front_pad;
if (!pools)
return NULL;
switch (type) {
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
break;
case DM_TYPE_REQUEST_BASED:
cachep = _rq_tio_cache;
pool_size = dm_get_reserved_rq_based_ios();
pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
if (!pools->rq_pool)
goto out;
/* fall through to setup remaining rq-based pools */
case DM_TYPE_MQ_REQUEST_BASED:
/* pool_size is already set when we fell through from the case above. */
if (!pool_size)
pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_io_data_size is used for blk-mq pdu at queue allocation */
break;
default:
BUG();
}
/* cachep stays NULL for blk-mq request-based, so no io_pool is created for it. */
if (cachep) {
pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
if (!pools->io_pool)
goto out;
}
pools->bs = bioset_create_nobvec(pool_size, front_pad);
if (!pools->bs)
goto out;
if (integrity && bioset_integrity_create(pools->bs, pool_size))
goto out;
return pools;
out:
dm_free_md_mempools(pools);
return NULL;
}
/*
 * Free a mempool container created by dm_alloc_md_mempools(), together
 * with the pools it still holds.  A NULL @pools is a no-op.
 */
void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (pools) {
		mempool_destroy(pools->io_pool);
		mempool_destroy(pools->rq_pool);

		if (pools->bs)
			bioset_free(pools->bs);

		kfree(pools);
	}
}
/*
 * Arguments for a persistent-reservation register call, handed to the
 * per-device callback (__dm_pr_register()) through iterate_devices.
 */
struct dm_pr {
/* Forwarded as the old_key argument of the underlying pr_register(). */
u64 old_key;
/* Forwarded as the new_key argument of the underlying pr_register(). */
u64 new_key;
/* Forwarded as the flags argument of the underlying pr_register(). */
u32 flags;
/*
 * Set to true by dm_pr_register() for the registration pass and to false
 * for its rollback pass; not read by the code visible in this part of
 * the file.
 */
bool fail_early;
};
/*
 * Run @fn, with @data, over the underlying devices of the DM device behind
 * @bdev, using the target's iterate_devices method.
 *
 * Returns -ENOTTY if there is no live table, the table is empty or it does
 * not consist of exactly one target; -EINVAL if the target has no
 * iterate_devices method; otherwise whatever iterate_devices returns.
 */
static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *live;
	struct dm_target *tgt;
	int srcu_idx;
	int ret = -ENOTTY;

	live = dm_get_live_table(md, &srcu_idx);

	/* We only support devices that have a single target */
	if (live && dm_table_get_size(live) &&
	    dm_table_get_num_targets(live) == 1) {
		tgt = dm_table_get_target(live, 0);

		if (tgt->type->iterate_devices)
			ret = tgt->type->iterate_devices(tgt, fn, data);
		else
			ret = -EINVAL;
	}

	dm_put_live_table(md, srcu_idx);
	return ret;
}
/*
 * For register / unregister we need to manually call out to every path.
 *
 * iterate_devices callback: forwards the register request described by
 * @data (a struct dm_pr) to the pr_ops of one underlying device.  @ti,
 * @start and @len are not used.
 *
 * Returns -EOPNOTSUPP if the underlying device has no pr_register
 * operation, otherwise that operation's result.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
	struct dm_pr *pr = data;

	if (ops && ops->pr_register)
		return ops->pr_register(dev->bdev, pr->old_key, pr->new_key,
					pr->flags);

	return -EOPNOTSUPP;
}
/*
 * pr_ops->pr_register for DM devices: register the key change on every
 * underlying device.
 *
 * If that fails and a new key was being registered, a second pass tries to
 * unregister @new_key from all paths again; the result of this rollback
 * pass is ignored.
 *
 * Returns the result of the registration pass.
 */
static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret = dm_call_pr(bdev, __dm_pr_register, &pr);

	if (!ret || !new_key)
		return ret;

	/* unregister all paths if we failed to register any path */
	pr.old_key = new_key;
	pr.new_key = 0;
	pr.flags = 0;
	pr.fail_early = false;
	dm_call_pr(bdev, __dm_pr_register, &pr);

	return ret;
}
/*
 * pr_ops->pr_reserve for DM devices: pass the reservation request through
 * to the block device that dm_grab_bdev_for_ioctl() selects.
 *
 * Returns a negative error from dm_grab_bdev_for_ioctl(), -EOPNOTSUPP if
 * that device has no pr_reserve operation, or the operation's result.
 */
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	/* On success @bdev now refers to the underlying device. */
	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	r = (ops && ops->pr_reserve) ? ops->pr_reserve(bdev, key, type, flags)
				     : -EOPNOTSUPP;

	bdput(bdev);
	return r;
}
/*
 * pr_ops->pr_release for DM devices: pass the release request through to
 * the block device that dm_grab_bdev_for_ioctl() selects.
 *
 * Returns a negative error from dm_grab_bdev_for_ioctl(), -EOPNOTSUPP if
 * that device has no pr_release operation, or the operation's result.
 */
static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	/* On success @bdev now refers to the underlying device. */
	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	r = (ops && ops->pr_release) ? ops->pr_release(bdev, key, type)
				     : -EOPNOTSUPP;

	bdput(bdev);
	return r;
}
/*
 * pr_ops->pr_preempt for DM devices: pass the preempt request through to
 * the block device that dm_grab_bdev_for_ioctl() selects.
 *
 * Returns a negative error from dm_grab_bdev_for_ioctl(), -EOPNOTSUPP if
 * that device has no pr_preempt operation, or the operation's result.
 */
static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	/* On success @bdev now refers to the underlying device. */
	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	r = (ops && ops->pr_preempt)
		? ops->pr_preempt(bdev, old_key, new_key, type, abort)
		: -EOPNOTSUPP;

	bdput(bdev);
	return r;
}
/*
 * pr_ops->pr_clear for DM devices: pass the clear request through to the
 * block device that dm_grab_bdev_for_ioctl() selects.
 *
 * Returns a negative error from dm_grab_bdev_for_ioctl(), -EOPNOTSUPP if
 * that device has no pr_clear operation, or the operation's result.
 */
static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	/* On success @bdev now refers to the underlying device. */
	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	r = (ops && ops->pr_clear) ? ops->pr_clear(bdev, key) : -EOPNOTSUPP;

	bdput(bdev);
	return r;
}
/*
 * Persistent-reservation operations for DM devices, hooked up through
 * dm_blk_dops.pr_ops.  Register goes to every underlying device via
 * dm_call_pr(); the other operations go to the single device chosen by
 * dm_grab_bdev_for_ioctl().
 */
static const struct pr_ops dm_pr_ops = {
.pr_register = dm_pr_register,
.pr_reserve = dm_pr_reserve,
.pr_release = dm_pr_release,
.pr_preempt = dm_pr_preempt,
.pr_clear = dm_pr_clear,
};
/*
 * Block-device operations installed on every DM gendisk (see alloc_dev(),
 * which sets md->disk->fops to this table).
 */
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.direct_access = dm_blk_direct_access,
.getgeo = dm_blk_getgeo,
.pr_ops = &dm_pr_ops,
.owner = THIS_MODULE
};
/*
 * module hooks
 *
 * Module entry/exit points, module parameters and module metadata.
 */
module_init(dm_init);
module_exit(dm_exit);
/* Permission bits 0 for "major"; the two parameters below are world-readable and owner-writable. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");