// SPDX-License-Identifier: GPL-2.0-only /* * IBM Accelerator Family 'GenWQE' * * (C) Copyright IBM Corp. 2013 * * Author: Frank Haverkamp <[email protected]> * Author: Joerg-Stephan Vogt <[email protected]> * Author: Michael Jung <[email protected]> * Author: Michael Ruettger <[email protected]> */ /* * Module initialization and PCIe setup. Card health monitoring and * recovery functionality. Character device creation and deletion are * controlled from here. */ #include <linux/types.h> #include <linux/pci.h> #include <linux/err.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/device.h> #include <linux/log2.h> #include "card_base.h" #include "card_ddcb.h" MODULE_AUTHOR(…) …; MODULE_AUTHOR(…) …; MODULE_AUTHOR(…) …; MODULE_AUTHOR(…) …; MODULE_DESCRIPTION(…) …; MODULE_VERSION(…); MODULE_LICENSE(…) …; static char genwqe_driver_name[] = …; static struct dentry *debugfs_genwqe; static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX]; /* PCI structure for identifying device by PCI vendor and device ID */ static const struct pci_device_id genwqe_device_table[] = …; MODULE_DEVICE_TABLE(pci, genwqe_device_table); /** * genwqe_devnode() - Set default access mode for genwqe devices. * @dev: Pointer to device (unused) * @mode: Carrier to pass-back given mode (permissions) * * Default mode should be rw for everybody. Do not change default * device name. 
*/ static char *genwqe_devnode(const struct device *dev, umode_t *mode) { … } static const struct class class_genwqe = …; /** * genwqe_dev_alloc() - Create and prepare a new card descriptor * * Return: Pointer to card descriptor, or ERR_PTR(err) on error */ static struct genwqe_dev *genwqe_dev_alloc(void) { … } static void genwqe_dev_free(struct genwqe_dev *cd) { … } /** * genwqe_bus_reset() - Card recovery * @cd: GenWQE device information * * pci_reset_function() will recover the device and ensure that the * registers are accessible again when it completes with success. If * not, the card will stay dead and registers will still be * inaccessible. */ static int genwqe_bus_reset(struct genwqe_dev *cd) { … } /* * Hardware circumvention section. Certain bitstreams in our test-lab * had different kinds of problems. Here is where we adjust those * bitstreams to function well with this version of our device driver. * * These circumventions are applied to the physical function only. * The magic numbers below identify development/manufacturing * versions of the bitstream used on the card. * * Turn off error reporting for old/manufacturing images. */ bool genwqe_need_err_masking(struct genwqe_dev *cd) { … } static void genwqe_tweak_hardware(struct genwqe_dev *cd) { … } /** * genwqe_recovery_on_fatal_gfir_required() - Version dependent actions * @cd: GenWQE device information * * Bitstreams older than 2013-02-17 have a bug where fatal GFIRs must * be ignored. This is e.g. true for the bitstream we gave to the card * manufacturer, but also for some old bitstreams we released to our * test-lab. */ int genwqe_recovery_on_fatal_gfir_required(struct genwqe_dev *cd) { … } int genwqe_flash_readback_fails(struct genwqe_dev *cd) { … } /** * genwqe_T_psec() - Calculate PF/VF timeout register content * @cd: GenWQE device information * * Note: From a design perspective it turned out to be a bad idea to * use codes here to specify the frequency/speed values. 
An old * driver cannot understand new codes and is therefore always a * problem. Better is to measure out the value or put the * speed/frequency directly into a register which is always a valid * value for old as well as for new software. */ /* T = 1/f */ static int genwqe_T_psec(struct genwqe_dev *cd) { … } /** * genwqe_setup_pf_jtimer() - Setup PF hardware timeouts for DDCB execution * @cd: GenWQE device information * * Do this _after_ card_reset() is called. Otherwise the values will * vanish. The settings need to be done when the queues are inactive. * * The max. timeout value is 2^(10+x) * T (6ns for 166MHz) * 15/16. * The min. timeout value is 2^(10+x) * T (6ns for 166MHz) * 14/16. */ static bool genwqe_setup_pf_jtimer(struct genwqe_dev *cd) { … } /** * genwqe_setup_vf_jtimer() - Setup VF hardware timeouts for DDCB execution * @cd: GenWQE device information */ static bool genwqe_setup_vf_jtimer(struct genwqe_dev *cd) { … } static int genwqe_ffdc_buffs_alloc(struct genwqe_dev *cd) { … } static void genwqe_ffdc_buffs_free(struct genwqe_dev *cd) { … } static int genwqe_read_ids(struct genwqe_dev *cd) { … } static int genwqe_start(struct genwqe_dev *cd) { … } /** * genwqe_stop() - Stop card operation * @cd: GenWQE device information * * Recovery notes: * As long as genwqe_thread runs we might access registers during * error data capture. The same applies to the genwqe_health_thread. * When genwqe_bus_reset() fails this function might be called two times: * first by the genwqe_health_thread() and later by genwqe_remove() to * unbind the device. We must be able to survive that. * * This function must be robust enough to be called twice. */ static int genwqe_stop(struct genwqe_dev *cd) { … } /** * genwqe_recover_card() - Try to recover the card if it is possible * @cd: GenWQE device information * @fatal_err: Indicate whether to attempt soft reset * * If fatal_err is set no register access is possible anymore. It is * likely that genwqe_start fails in that situation. 
Proper error * handling is required in this case. * * genwqe_bus_reset() will cause the pci code to call genwqe_remove() * and later genwqe_probe() for all virtual functions. */ static int genwqe_recover_card(struct genwqe_dev *cd, int fatal_err) { … } static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir) { … } /** * genwqe_fir_checking() - Check the fault isolation registers of the card * @cd: GenWQE device information * * Whether this code works can be tried out with the help of the genwqe_poke tool: * sudo ./tools/genwqe_poke 0x8 0xfefefefefef * * Now the relevant FIRs/sFIRs should be printed out and the driver should * invoke recovery (devices are removed and re-added). */ static u64 genwqe_fir_checking(struct genwqe_dev *cd) { … } /** * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot * @pci_dev: PCI device information struct * * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this * reset method will not work in all cases. * * Return: 0 on success or error code from pci_set_pcie_reset_state() */ static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev) { … } static int genwqe_platform_recovery(struct genwqe_dev *cd) { … } /** * genwqe_reload_bistream() - reload card bitstream * @cd: GenWQE device information * * Set the appropriate register and call fundamental reset to reload the card * bitstream. * * Return: 0 on success, error code otherwise */ static int genwqe_reload_bistream(struct genwqe_dev *cd) { … } /** * genwqe_health_thread() - Health checking thread * @data: GenWQE device information * * This thread is only started for the PF of the card. * * This thread monitors the health of the card. A critical situation * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In * this case we need to be recovered from outside. Writing to * registers will very likely not work either. * * This thread must only exit if kthread_should_stop() becomes true. 
* * Condition for the health-thread to trigger: * a) when a kthread_stop() request comes in or * b) a critical GFIR occurred * * Informational GFIRs are checked and potentially printed in * GENWQE_HEALTH_CHECK_INTERVAL seconds. */ static int genwqe_health_thread(void *data) { … } static int genwqe_health_check_start(struct genwqe_dev *cd) { … } static int genwqe_health_thread_running(struct genwqe_dev *cd) { … } static int genwqe_health_check_stop(struct genwqe_dev *cd) { … } /** * genwqe_pci_setup() - Allocate PCIe related resources for our card * @cd: GenWQE device information */ static int genwqe_pci_setup(struct genwqe_dev *cd) { … } /** * genwqe_pci_remove() - Free PCIe related resources for our card * @cd: GenWQE device information */ static void genwqe_pci_remove(struct genwqe_dev *cd) { … } /** * genwqe_probe() - Device initialization * @pci_dev: PCI device information struct * @id: PCI device ID * * Callable for multiple cards. This function is called on bind. * * Return: 0 if succeeded, < 0 when failed */ static int genwqe_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { … } /** * genwqe_remove() - Called when device is removed (hot-pluggable) * @pci_dev: PCI device information struct * * Or when driver is unloaded respectively when unbind is done. */ static void genwqe_remove(struct pci_dev *pci_dev) { … } /** * genwqe_err_error_detected() - Error detection callback * @pci_dev: PCI device information struct * @state: PCI channel state * * This callback is called by the PCI subsystem whenever a PCI bus * error is detected. 
*/ static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev, pci_channel_state_t state) { … } static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev) { … } static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev) { … } static void genwqe_err_resume(struct pci_dev *pci_dev) { … } static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs) { … } static const struct pci_error_handlers genwqe_err_handler = …; static struct pci_driver genwqe_driver = …; /** * genwqe_init_module() - Driver registration and initialization */ static int __init genwqe_init_module(void) { … } /** * genwqe_exit_module() - Driver exit */ static void __exit genwqe_exit_module(void) { … } module_init(…) …; module_exit(genwqe_exit_module);