/* Copyright 2009 - 2016 Freescale Semiconductor, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Freescale Semiconductor nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * ALTERNATIVELY, this software may be distributed under the terms of the * GNU General Public License ("GPL") as published by the Free Software * Foundation, either version 2 of that License or (at your option) any * later version. * * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "qman_test.h" #include <linux/dma-mapping.h> #include <linux/delay.h> /* * Algorithm: * * Each cpu will have HP_PER_CPU "handlers" set up, each of which incorporates * an rx/tx pair of FQ objects (both of which are stashed on dequeue). 
The * organisation of FQIDs is such that the HP_PER_CPU*NUM_CPUS handlers will * shuttle a "hot potato" frame around them such that every forwarding action * moves it from one cpu to another. (The use of more than one handler per cpu * is to allow enough handlers/FQs to truly test the significance of caching - * i.e. when cache-expiries are occurring.) * * The "hot potato" frame content will be HP_NUM_WORDS*4 bytes in size, and the * first and last words of the frame data will undergo a transformation step on * each forwarding action. To achieve this, each handler will be assigned a * 32-bit "mixer", that is produced using a 32-bit LFSR. When a frame is * received by a handler, the mixer of the expected sender is XOR'd into all * words of the entire frame, which is then validated against the original * values. Then, before forwarding, the entire frame is XOR'd with the mixer of * the current handler. Apart from validating that the frame is taking the * expected path, this also provides some quasi-realistic overheads to each * forwarding action - dereferencing *all* the frame data, computation, and * conditional branching. There is a "special" handler designated to act as the * instigator of the test by creating and enqueuing the "hot potato" frame, and * to determine when the test has completed by counting HP_LOOPS iterations. * * Init phases: * * 1. prepare each cpu's 'hp_cpu' struct using on_each_cpu(,,1) and link them * into 'hp_cpu_list'. Specifically, set processor_id, allocate HP_PER_CPU * handlers and link-list them (but do no other handler setup). * * 2. scan over 'hp_cpu_list' HP_PER_CPU times, the first time sets each * hp_cpu's 'iterator' to point to its first handler. With each loop, * allocate rx/tx FQIDs and mixer values to the hp_cpu's iterator handler * and advance the iterator for the next loop. This includes a final fixup, * which connects the last handler to the first (and which is why phase 2 * and 3 are separate). * * 3. 
scan over 'hp_cpu_list' HP_PER_CPU times, the first time sets each * hp_cpu's 'iterator' to point to its first handler. With each loop, * initialise FQ objects and advance the iterator for the next loop. * Moreover, do this initialisation on the cpu it applies to so that Rx FQ * initialisation targets the correct cpu. */ /* * helper to run something on all cpus (can't use on_each_cpu(), as that invokes * the fn from irq context, which is too restrictive). */ struct bstrap { … }; static int bstrap_fn(void *bs) { … } static int on_all_cpus(int (*fn)(void)) { … } struct hp_handler { … } ____cacheline_aligned; struct hp_cpu { … }; /* Each cpu has one of these */ static DEFINE_PER_CPU(struct hp_cpu, hp_cpus); /* links together the hp_cpu structs, in first-come first-serve order. */ static LIST_HEAD(hp_cpu_list); static DEFINE_SPINLOCK(hp_lock); static unsigned int hp_cpu_list_length; /* the "special" handler, that starts and terminates the test. */ static struct hp_handler *special_handler; static int loop_counter; /* handlers are allocated out of this, so they're properly aligned. 
*/ static struct kmem_cache *hp_handler_slab; /* this is the frame data */ static void *__frame_ptr; static u32 *frame_ptr; static dma_addr_t frame_dma; /* needed for dma_map*() */ static const struct qm_portal_config *pcfg; /* the main function waits on this */ static DECLARE_WAIT_QUEUE_HEAD(queue); #define HP_PER_CPU … #define HP_LOOPS … /* 80 bytes, like a small ethernet frame, and bleeds into a second cacheline */ #define HP_NUM_WORDS … /* First word of the LFSR-based frame data */ #define HP_FIRST_WORD … static inline u32 do_lfsr(u32 prev) { … } static int allocate_frame_data(void) { … } static void deallocate_frame_data(void) { … } static inline int process_frame_data(struct hp_handler *handler, const struct qm_fd *fd) { … } static enum qman_cb_dqrr_result normal_dqrr(struct qman_portal *portal, struct qman_fq *fq, const struct qm_dqrr_entry *dqrr, bool sched_napi) { … } static enum qman_cb_dqrr_result special_dqrr(struct qman_portal *portal, struct qman_fq *fq, const struct qm_dqrr_entry *dqrr, bool sched_napi) { … } static int create_per_cpu_handlers(void) { … } static int destroy_per_cpu_handlers(void) { … } static inline u8 num_cachelines(u32 offset) { … } #define STASH_DATA_CL … #define STASH_CTX_CL … static int init_handler(void *h) { … } static void init_handler_cb(void *h) { … } static int init_phase2(void) { … } static int init_phase3(void) { … } static int send_first_frame(void *ignore) { … } static void send_first_frame_cb(void *ignore) { … } int qman_test_stash(void) { … }