// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
#include <linux/mlx5/fs.h>
#include "en/mapping.h"
#include "en/tc/int_port.h"
#include "en.h"
#include "en_rep.h"
#include "en_tc.h"
struct mlx5e_tc_int_port {
enum mlx5e_tc_int_port_type type;
int ifindex;
u32 match_metadata;
u32 mapping;
struct list_head list;
struct mlx5_flow_handle *rx_rule;
refcount_t refcnt;
struct rcu_head rcu_head;
};
struct mlx5e_tc_int_port_priv {
struct mlx5_core_dev *dev;
struct mutex int_ports_lock; /* Protects int ports list */
struct list_head int_ports; /* Uses int_ports_lock */
u16 num_ports;
bool ul_rep_rx_ready; /* Set when uplink is performing teardown */
struct mapping_ctx *metadata_mapping; /* Metadata for source port rewrite and matching */
};
bool mlx5e_tc_int_port_supported(const struct mlx5_eswitch *esw)
{
return mlx5_eswitch_vport_match_metadata_enabled(esw) &&
MLX5_CAP_GEN(esw->dev, reg_c_preserve);
}
u32 mlx5e_tc_int_port_get_metadata(struct mlx5e_tc_int_port *int_port)
{
return int_port->match_metadata;
}
int mlx5e_tc_int_port_get_flow_source(struct mlx5e_tc_int_port *int_port)
{
/* For egress forwarding we can have the case
* where the packet came from a vport and redirected
* to int port or it came from the uplink, going
* via internal port and hairpinned back to uplink
* so we set the source to any port in this case.
*/
return int_port->type == MLX5E_TC_INT_PORT_EGRESS ?
MLX5_FLOW_CONTEXT_FLOW_SOURCE_ANY_VPORT :
MLX5_FLOW_CONTEXT_FLOW_SOURCE_UPLINK;
}
u32 mlx5e_tc_int_port_get_metadata_for_match(struct mlx5e_tc_int_port *int_port)
{
return int_port->match_metadata << (32 - ESW_SOURCE_PORT_METADATA_BITS);
}
static struct mlx5_flow_handle *
mlx5e_int_port_create_rx_rule(struct mlx5_eswitch *esw,
struct mlx5e_tc_int_port *int_port,
struct mlx5_flow_destination *dest)
{
struct mlx5_flow_context *flow_context;
struct mlx5_flow_act flow_act = {};
struct mlx5_flow_handle *flow_rule;
struct mlx5_flow_spec *spec;
void *misc;
spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
if (!spec)
return ERR_PTR(-ENOMEM);
misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2);
MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
mlx5e_tc_int_port_get_metadata_for_match(int_port));
misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2);
MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
mlx5_eswitch_get_vport_metadata_mask());
spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2;
/* Overwrite flow tag with the int port metadata mapping
* instead of the chain mapping.
*/
flow_context = &spec->flow_context;
flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
flow_context->flow_tag = int_port->mapping;
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec,
&flow_act, dest, 1);
if (IS_ERR(flow_rule))
mlx5_core_warn(esw->dev, "ft offloads: Failed to add internal vport rx rule err %ld\n",
PTR_ERR(flow_rule));
kvfree(spec);
return flow_rule;
}
static struct mlx5e_tc_int_port *
mlx5e_int_port_lookup(struct mlx5e_tc_int_port_priv *priv,
int ifindex,
enum mlx5e_tc_int_port_type type)
{
struct mlx5e_tc_int_port *int_port;
if (!priv->ul_rep_rx_ready)
goto not_found;
list_for_each_entry(int_port, &priv->int_ports, list)
if (int_port->ifindex == ifindex && int_port->type == type) {
refcount_inc(&int_port->refcnt);
return int_port;
}
not_found:
return NULL;
}
static int mlx5e_int_port_metadata_alloc(struct mlx5e_tc_int_port_priv *priv,
int ifindex, enum mlx5e_tc_int_port_type type,
u32 *id)
{
u32 mapped_key[2] = {type, ifindex};
int err;
err = mapping_add(priv->metadata_mapping, mapped_key, id);
if (err)
return err;
/* Fill upper 4 bits of PFNUM with reserved value */
*id |= 0xf << ESW_VPORT_BITS;
return 0;
}
static void mlx5e_int_port_metadata_free(struct mlx5e_tc_int_port_priv *priv,
u32 id)
{
id &= (1 << ESW_VPORT_BITS) - 1;
mapping_remove(priv->metadata_mapping, id);
}
/* Must be called with priv->int_ports_lock held */
static struct mlx5e_tc_int_port *
mlx5e_int_port_add(struct mlx5e_tc_int_port_priv *priv,
int ifindex,
enum mlx5e_tc_int_port_type type)
{
struct mlx5_eswitch *esw = priv->dev->priv.eswitch;
struct mlx5_mapped_obj mapped_obj = {};
struct mlx5e_rep_priv *uplink_rpriv;
struct mlx5e_tc_int_port *int_port;
struct mlx5_flow_destination dest;
struct mapping_ctx *ctx;
u32 match_metadata;
u32 mapping;
int err;
if (priv->num_ports == MLX5E_TC_MAX_INT_PORT_NUM) {
mlx5_core_dbg(priv->dev, "Cannot add a new int port, max supported %d",
MLX5E_TC_MAX_INT_PORT_NUM);
return ERR_PTR(-ENOSPC);
}
int_port = kzalloc(sizeof(*int_port), GFP_KERNEL);
if (!int_port)
return ERR_PTR(-ENOMEM);
err = mlx5e_int_port_metadata_alloc(priv, ifindex, type, &match_metadata);
if (err) {
mlx5_core_warn(esw->dev, "Cannot add a new internal port, metadata allocation failed for ifindex %d",
ifindex);
goto err_metadata;
}
/* map metadata to reg_c0 object for miss handling */
ctx = esw->offloads.reg_c0_obj_pool;
mapped_obj.type = MLX5_MAPPED_OBJ_INT_PORT_METADATA;
mapped_obj.int_port_metadata = match_metadata;
err = mapping_add(ctx, &mapped_obj, &mapping);
if (err)
goto err_map;
int_port->type = type;
int_port->ifindex = ifindex;
int_port->match_metadata = match_metadata;
int_port->mapping = mapping;
/* Create a match on internal vport metadata in vport table */
uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
dest.ft = uplink_rpriv->root_ft;
int_port->rx_rule = mlx5e_int_port_create_rx_rule(esw, int_port, &dest);
if (IS_ERR(int_port->rx_rule)) {
err = PTR_ERR(int_port->rx_rule);
mlx5_core_warn(esw->dev, "Can't add internal port rx rule, err %d", err);
goto err_rx_rule;
}
refcount_set(&int_port->refcnt, 1);
list_add_rcu(&int_port->list, &priv->int_ports);
priv->num_ports++;
return int_port;
err_rx_rule:
mapping_remove(ctx, int_port->mapping);
err_map:
mlx5e_int_port_metadata_free(priv, match_metadata);
err_metadata:
kfree(int_port);
return ERR_PTR(err);
}
/* Must be called with priv->int_ports_lock held */
static void
mlx5e_int_port_remove(struct mlx5e_tc_int_port_priv *priv,
struct mlx5e_tc_int_port *int_port)
{
struct mlx5_eswitch *esw = priv->dev->priv.eswitch;
struct mapping_ctx *ctx;
ctx = esw->offloads.reg_c0_obj_pool;
list_del_rcu(&int_port->list);
/* The following parameters are not used by the
* rcu readers of this int_port object so it is
* safe to release them.
*/
if (int_port->rx_rule)
mlx5_del_flow_rules(int_port->rx_rule);
mapping_remove(ctx, int_port->mapping);
mlx5e_int_port_metadata_free(priv, int_port->match_metadata);
kfree_rcu_mightsleep(int_port);
priv->num_ports--;
}
/* Must be called with rcu_read_lock held */
static struct mlx5e_tc_int_port *
mlx5e_int_port_get_from_metadata(struct mlx5e_tc_int_port_priv *priv,
u32 metadata)
{
struct mlx5e_tc_int_port *int_port;
list_for_each_entry_rcu(int_port, &priv->int_ports, list)
if (int_port->match_metadata == metadata)
return int_port;
return NULL;
}
struct mlx5e_tc_int_port *
mlx5e_tc_int_port_get(struct mlx5e_tc_int_port_priv *priv,
int ifindex,
enum mlx5e_tc_int_port_type type)
{
struct mlx5e_tc_int_port *int_port;
if (!priv)
return ERR_PTR(-EOPNOTSUPP);
mutex_lock(&priv->int_ports_lock);
/* Reject request if ul rep not ready */
if (!priv->ul_rep_rx_ready) {
int_port = ERR_PTR(-EOPNOTSUPP);
goto done;
}
int_port = mlx5e_int_port_lookup(priv, ifindex, type);
if (int_port)
goto done;
/* Alloc and add new int port to list */
int_port = mlx5e_int_port_add(priv, ifindex, type);
done:
mutex_unlock(&priv->int_ports_lock);
return int_port;
}
void
mlx5e_tc_int_port_put(struct mlx5e_tc_int_port_priv *priv,
struct mlx5e_tc_int_port *int_port)
{
if (!refcount_dec_and_mutex_lock(&int_port->refcnt, &priv->int_ports_lock))
return;
mlx5e_int_port_remove(priv, int_port);
mutex_unlock(&priv->int_ports_lock);
}
struct mlx5e_tc_int_port_priv *
mlx5e_tc_int_port_init(struct mlx5e_priv *priv)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5e_tc_int_port_priv *int_port_priv;
u64 mapping_id;
if (!mlx5e_tc_int_port_supported(esw))
return NULL;
int_port_priv = kzalloc(sizeof(*int_port_priv), GFP_KERNEL);
if (!int_port_priv)
return NULL;
mapping_id = mlx5_query_nic_system_image_guid(priv->mdev);
int_port_priv->metadata_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_INT_PORT,
sizeof(u32) * 2,
(1 << ESW_VPORT_BITS) - 1, true);
if (IS_ERR(int_port_priv->metadata_mapping)) {
mlx5_core_warn(priv->mdev, "Can't allocate metadata mapping of int port offload, err=%ld\n",
PTR_ERR(int_port_priv->metadata_mapping));
goto err_mapping;
}
int_port_priv->dev = priv->mdev;
mutex_init(&int_port_priv->int_ports_lock);
INIT_LIST_HEAD(&int_port_priv->int_ports);
return int_port_priv;
err_mapping:
kfree(int_port_priv);
return NULL;
}
void
mlx5e_tc_int_port_cleanup(struct mlx5e_tc_int_port_priv *priv)
{
if (!priv)
return;
mutex_destroy(&priv->int_ports_lock);
mapping_destroy(priv->metadata_mapping);
kfree(priv);
}
/* Int port rx rules reside in ul rep rx tables.
* It is possible the ul rep will go down while there are
* still int port rules in its rx table so proper cleanup
* is required to free resources.
*/
void mlx5e_tc_int_port_init_rep_rx(struct mlx5e_priv *priv)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_rep_uplink_priv *uplink_priv;
struct mlx5e_tc_int_port_priv *ppriv;
struct mlx5e_rep_priv *uplink_rpriv;
uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
uplink_priv = &uplink_rpriv->uplink_priv;
ppriv = uplink_priv->int_port_priv;
if (!ppriv)
return;
mutex_lock(&ppriv->int_ports_lock);
ppriv->ul_rep_rx_ready = true;
mutex_unlock(&ppriv->int_ports_lock);
}
void mlx5e_tc_int_port_cleanup_rep_rx(struct mlx5e_priv *priv)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_rep_uplink_priv *uplink_priv;
struct mlx5e_tc_int_port_priv *ppriv;
struct mlx5e_rep_priv *uplink_rpriv;
struct mlx5e_tc_int_port *int_port;
uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
uplink_priv = &uplink_rpriv->uplink_priv;
ppriv = uplink_priv->int_port_priv;
if (!ppriv)
return;
mutex_lock(&ppriv->int_ports_lock);
ppriv->ul_rep_rx_ready = false;
list_for_each_entry(int_port, &ppriv->int_ports, list) {
if (!IS_ERR_OR_NULL(int_port->rx_rule))
mlx5_del_flow_rules(int_port->rx_rule);
int_port->rx_rule = NULL;
}
mutex_unlock(&ppriv->int_ports_lock);
}
bool
mlx5e_tc_int_port_dev_fwd(struct mlx5e_tc_int_port_priv *priv,
struct sk_buff *skb, u32 int_vport_metadata,
bool *forward_tx)
{
enum mlx5e_tc_int_port_type fwd_type;
struct mlx5e_tc_int_port *int_port;
struct net_device *dev;
int ifindex;
if (!priv)
return false;
rcu_read_lock();
int_port = mlx5e_int_port_get_from_metadata(priv, int_vport_metadata);
if (!int_port) {
rcu_read_unlock();
mlx5_core_dbg(priv->dev, "Unable to find int port with metadata 0x%.8x\n",
int_vport_metadata);
return false;
}
ifindex = int_port->ifindex;
fwd_type = int_port->type;
rcu_read_unlock();
dev = dev_get_by_index(&init_net, ifindex);
if (!dev) {
mlx5_core_dbg(priv->dev,
"Couldn't find internal port device with ifindex: %d\n",
ifindex);
return false;
}
skb->skb_iif = dev->ifindex;
skb->dev = dev;
if (fwd_type == MLX5E_TC_INT_PORT_INGRESS) {
skb->pkt_type = PACKET_HOST;
skb_set_redirected(skb, true);
*forward_tx = false;
} else {
skb_reset_network_header(skb);
skb_push_rcsum(skb, skb->mac_len);
skb_set_redirected(skb, false);
*forward_tx = true;
}
return true;
}