BPF 進階筆記(三):BPF Map 內核實現

語言: CN / TW / HK

關於本文

內核目前支持 30 來種 BPF map 類型。

本文整理一些與這些 map 相關的內核實現。

關於 “BPF 進階筆記” 系列

平時學習使用 BPF 時所整理。由於是筆記而非教程,因此內容不會追求連貫,有基礎的 同學可作查漏補缺之用。

文中涉及的代碼,如無特殊説明,均基於內核 5.8/5.10 版本。

  • 關於 “BPF 進階筆記” 系列
    • BPF map 類型:完整列表
    • 不同 map 類型支持的操作( xx_map_ops ):完整列表
    • struct bpf_map :內核 bpf map 結構體
  • ————————————————————————
  • ————————————————————————
    • struct bpf_htab :內核哈希表
    • struct bucket :哈希槽(鏈表,不存放實際數據)
    • struct htab_elem :存放 hash+key+value 等數據
    • 創建 map: struct bpf_map *htab_map_alloc()
    • 查詢 map: void *htab_map_lookup_elem()
    • 插入或更新 map: int htab_map_update_elem()
    • 獲取下一個 key: int htab_map_get_next_key(map, key, void *next_key)
  • 2 BPF_MAP_TYPE_PERCPU_HASH
  • 3 BPF_MAP_TYPE_LRU_HASH
  • 4 BPF_MAP_TYPE_LRU_PERCPU_HASH
  • 5 BPF_MAP_TYPE_HASH_OF_MAPS
  • ————————————————————————
  • ————————————————————————
    • 創建 map: struct bpf_map *array_map_alloc()
    • 查詢 map: void *array_map_lookup_elem()
  • 2 BPF_MAP_TYPE_PERCPU_ARRAY
  • 3 BPF_MAP_TYPE_PROG_ARRAY
  • 4 BPF_MAP_TYPE_PERF_EVENT_ARRAY
  • 5 BPF_MAP_TYPE_ARRAY_OF_MAPS
  • 6 BPF_MAP_TYPE_CGROUP_ARRAY
  • ————————————————————————
  • ————————————————————————
  • 1 BPF_MAP_TYPE_CGROUP_ARRAY
  • 2 BPF_MAP_TYPE_CGROUP_STORAGE
      • struct bpf_cgroup_storage_map :cgroup storage map 定義
      • struct bpf_cgroup_storage_key :key 定義
    • 創建 map: struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
    • 初始化指定類型的 cgroup storage: struct bpf_cgroup_storage *bpf_cgroup_storage_alloc()
  • 3 BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE
  • ————————————————————————
  • ————————————————————————
  • 1 BPF_MAP_TYPE_STACK_TRACE
  • 3 BPF_MAP_TYPE_RINGBUF
  • 4 BPF_MAP_TYPE_PERF_EVENT_ARRAY
  • ————————————————————————
  • ————————————————————————
  • 1 BPF_MAP_TYPE_SOCKMAP
  • 2 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY
  • 3 BPF_MAP_TYPE_SK_STORAGE
  • ————————————————————————
  • ————————————————————————
  • 1 BPF_MAP_TYPE_SOCKHASH
  • 2 BPF_MAP_TYPE_DEVMAP
  • 3 BPF_MAP_TYPE_DEVMAP_HASH
  • 4 BPF_MAP_TYPE_XSKMAP
  • ————————————————————————
  • ————————————————————————
  • 1 BPF_MAP_TYPE_CPUMAP
  • 3 BPF_MAP_TYPE_STRUCT_OPS
  • 4 BPF_MAP_TYPE_LPM_TRIE

基礎

BPF map 類型:完整列表

所有 map 類型的 定義

// include/uapi/linux/bpf.h

enum bpf_map_type {
    BPF_MAP_TYPE_UNSPEC,
    BPF_MAP_TYPE_HASH,
    BPF_MAP_TYPE_ARRAY,
    BPF_MAP_TYPE_PROG_ARRAY,
    BPF_MAP_TYPE_PERF_EVENT_ARRAY,
    BPF_MAP_TYPE_PERCPU_HASH,
    BPF_MAP_TYPE_PERCPU_ARRAY,
    BPF_MAP_TYPE_STACK_TRACE,
    BPF_MAP_TYPE_CGROUP_ARRAY,
    BPF_MAP_TYPE_LRU_HASH,
    BPF_MAP_TYPE_LRU_PERCPU_HASH,
    BPF_MAP_TYPE_LPM_TRIE,
    BPF_MAP_TYPE_ARRAY_OF_MAPS,
    BPF_MAP_TYPE_HASH_OF_MAPS,
    BPF_MAP_TYPE_DEVMAP,
    BPF_MAP_TYPE_SOCKMAP,
    BPF_MAP_TYPE_CPUMAP,
    BPF_MAP_TYPE_XSKMAP,
    BPF_MAP_TYPE_SOCKHASH,
    BPF_MAP_TYPE_CGROUP_STORAGE,
    BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
    BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
    BPF_MAP_TYPE_QUEUE,
    BPF_MAP_TYPE_STACK,
    BPF_MAP_TYPE_SK_STORAGE,
    BPF_MAP_TYPE_DEVMAP_HASH,
    BPF_MAP_TYPE_STRUCT_OPS,
    BPF_MAP_TYPE_RINGBUF,
};

不同 map 類型支持的操作( xx_map_ops ):完整列表

include/linux/bpf_types.h 中定義了不同類型的 BPF map 所支持的操作:

// include/linux/bpf_types.h

BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY,            array_map_ops)             // kernel/bpf/arraymap.c
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY,     percpu_array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY,       prog_array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY,     cgroup_array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE,   cgroup_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH,             htab_map_ops)              // kernel/bpf/hashtab.c
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH,      htab_percpu_map_ops)       // kernel/bpf/hashtab.c
BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH,         htab_lru_map_ops)          // kernel/bpf/hashtab.c
BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH,  htab_lru_percpu_map_ops)   // kernel/bpf/hashtab.c
BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE,         trie_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE,      stack_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS,    array_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS,     htab_of_maps_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP,           dev_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP,          sock_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH,         sock_hash_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP,           cpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP,           xsk_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)

大部分實現都在 kernel/bpf/ 目錄下。

重要結構體

struct bpf_map :內核 bpf map 結構體

調用 map 的 create 方法 創建 map 時 ,返回的是一個 struct bpf_map * 對象, 記錄 map 元數據 ,定義如下:

// include/linux/bpf.h

struct bpf_map {
    // First two cachelines with read-mostly members of which some are also accessed in fast-path (e.g. ops, max_entries).
    const struct bpf_map_ops *ops ____cacheline_aligned; // 增刪查改等操作函數
    struct bpf_map *inner_map_meta;

    void *security;

    // 常規 map 屬性
    enum bpf_map_type map_type;
    u32 key_size;
    u32 value_size;
    u32 max_entries;
    u32 map_flags;

    int    spin_lock_off; /* >=0 valid offset, <0 error */
    u32    id;
    int    numa_node;
    u32    btf_key_type_id;
    u32    btf_value_type_id;
    struct btf *btf;
    struct bpf_map_memory memory;
    char   name[BPF_OBJ_NAME_LEN];
    u32    btf_vmlinux_value_type_id;
    bool   bypass_spec_v1;
    bool   frozen; /* write-once; write-protected by freeze_mutex */
    /* 22 bytes hole */

    // The 3rd and 4th cacheline with misc members to avoid false sharing particularly with refcounting.
    atomic64_t refcnt ____cacheline_aligned; // 引用計數
    atomic64_t usercnt;                      // 引用計數
    struct work_struct work;
    struct mutex freeze_mutex;
    u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

其中尤其重要的幾個字段:

  1. refcnt :引用計數, 很多類型的 map 是依據 refcnt 是否等於 0 來執行 map GC 的 。更多信息,參考 (譯) BPF 對象(BPF objects)的生命週期(Facebook,2018)

————————————————————————

Hash Maps

————————————————————————

Hash map 的實現見 kernel/bpf/hashtab.c 。 五種類型共用一套代碼:

BPF_MAP_TYPE_HASH
BPF_MAP_TYPE_PERCPU_HASH
BPF_MAP_TYPE_LRU_HASH
BPF_MAP_TYPE_LRU_PERCPU_HASH
BPF_MAP_TYPE_HASH_OF_MAPS

數據結構設計

下圖是 BPF hash map 如何組織的

BPF Hash Map    

  +--------------------+
  | struct bpf_map map |--> general BPF map (metadata)
  |--------------------|
  | struct bucket *    |--> bucket linked-list
  |--------------------|
  | void *elems        |--> elements (hash+key+value), link-listed
  |--------------------|
  |                    |
  |--------------------|
  | count, n_buckets,  |--> hash map metadata
  | elem_size, hashrnd |
  +--------------------+
  1. 內核表示 BPF map 的結構體 struct bpf_map 不區 map 分類型 的,因此 hash map 在 BPF map 之上又封裝了一層 ,即 struct bpf_htab 表示一個內核 hash map

  2. Hash map 又主要分為兩部分:

    1. Buckets:對 key 進行哈希之後找到對應的 buckets,但這裏存放的只是 buckets 鏈表和鎖等元數據, 不存放數據
    2. Elements:即 真正需要存放的數據 ,也組織成鏈表。

重要結構體

struct bpf_htab :內核哈希表

前面已經提到,內核 bpf map struct bpf_map 記錄的只是通用 map 元數據(不區分 map 類型)。 對於 hash map 來説,真正的數據是存儲在哈希表 struct bpf_htab 裏:

// kernel/bpf/hashtab.c

// 哈希表(hash table)數據結構
struct bpf_htab {
    struct bpf_map map;      // 這個 hash map 對應的 BPF map(元數據)

    struct bucket *buckets;  // 鏈表 + 鎖
    void *elems;             // 每個 elem 的結構:struct htab_elem + key + value
    union {
        struct pcpu_freelist freelist;
        struct bpf_lru lru;
    };
    struct htab_elem *__percpu *extra_elems;

    atomic_t count;          // element 數量
    u32 n_buckets;           // buckets 數量
    u32 elem_size;           // element 大小,單位 bytes
    u32 hashrnd;             // 哈希隨機數
};

從定義可以看出,這個哈希表支持前面提到個所有 hash map 類型(percpu、lru 等等)。

另外,這個結構體裏有 三段內存區域 ,分別存儲 bucket 信息、 elements(hash+key+value 等)信息和額外的 elements 信息:

+--+--+--+--+--+----------+
buckets      |  |  |  |  |  | ...      |
             +--+--+--+--+--+----------+
             +-------+--------+--------+------------------------------+
elems        |       |        |        | ...                          |
             +-------+--------+--------+------------------------------+
             +-------+--------+-----------------+
extra_elemes |       |        | ...             | 只有 8 個空間,普通 hash map 用;PERCPU、LRU map 不用。
             +-------+--------+-----------------+

struct bucket :哈希槽(鏈表,不存放實際數據)

Bucket 存放的只是鏈表和鎖, 並不存放實際數據

// kernel/bpf/hashtab.c

struct bucket {
    struct hlist_nulls_head head;
    union {
        raw_spinlock_t raw_lock;
        spinlock_t     lock;
    };
};

關於這個結構體的用途,內核中有超長的註釋,見源碼。

struct htab_elem :存放 hash+key+value 等數據

存放實際的 map 數據:

// kernel/bpf/hashtab.c

struct htab_elem {
    // 不同 hash map 類型特定的字段
    union {
        struct hlist_nulls_node hash_node; // 內核通用鏈表類型之一 hlist_nulls
        struct {
            void *padding;
            union {
                struct bpf_htab *htab;
                struct pcpu_freelist_node fnode;
                struct htab_elem *batch_flink;
            };
        };
    };
    union {
        struct rcu_head rcu;
        struct bpf_lru_node lru_node;
    };

    // 公共字段
    u32 hash;                 // 哈希值
    char key[] __aligned(8);  // 指針,指向接下來的 key+value 數據
};

這個結構體也包含了一個鏈表結構,但注意最後兩個字段:

  1. u32 hash :這是 對 key 進行哈希之後得到的哈希值
  2. char key[] :這是一個指針,在初始化時會 指向 key+value 的連續內存區域

1 BPF_MAP_TYPE_HASH

幾種不同類型的 hash map 都是在同一套代碼中實現的。這裏看一下其中最簡單的一種:hash map。

首先, 註冊的操作方法 (回調函數)集合見 kernel/bpf/hashtab.c

// kernel/bpf/hashtab.c

const struct bpf_map_ops htab_map_ops = {
    .map_alloc_check   = htab_map_alloc_check,
    .map_alloc         = htab_map_alloc,
    .map_free          = htab_map_free,
    .map_get_next_key  = htab_map_get_next_key,
    .map_lookup_elem   = htab_map_lookup_elem,   // 查找
    .map_update_elem   = htab_map_update_elem,   // 創建或更新
    .map_delete_elem   = htab_map_delete_elem,   // 刪除
    .map_gen_lookup    = htab_map_gen_lookup,
    .map_seq_show_elem = htab_map_seq_show_elem,
    BATCH_OPS(htab),
};

創建 map: struct bpf_map *htab_map_alloc()

函數簽名:

// kernel/bpf/hashtab.c

static struct bpf_map *htab_map_alloc(union bpf_attr *attr)

實現在 kernel/bpf/hashtab.c 調用棧

htab_map_alloc                                   # kernel/bpf/hashtab.c
  |-htab = kzalloc(sizeof(*htab), GFP_USER)
  |-bpf_map_init_from_attr(&htab->map, attr)     # 初始化 map 元數據;htab->map 就是 struct bpf_map
  |
  |-htab->n_buckets = ...
  |-htab->elem_size = ...
  |-bpf_map_charge_init(&htab->map.memory, cost) # 確保 map size 不會過大
  |
  | # 分配 bucket 內存
  |-htab->buckets = bpf_map_area_alloc(size, numa_node)                        // kernel/bpf/syscall.c
  |                  |-__bpf_map_area_alloc(size, numa_node, false)
  |                     |-if condition
  |                     |   kmalloc_node(GFP_USER)
  |                     |-else
  |                         __vmalloc_node_range(GFP_KERNEL)
  |
  |-htab->hashrnd = ...                          # 初始化哈希種子
  |
  |-if (prealloc) { # 提前為所有 elements 分配內存
      prealloc_init(htab);
        |-htab->elems = bpf_map_area_alloc(elem_size*n_entries, numa_node);
        |-per-cpu and lru initiazations if needed

      # 分配 extra 內存
      if (!percpu && !lru)       // lru itself can remove the least used element, so
        alloc_extra_elems(htab); // there is no need for an extra elem during map_update
          |-__alloc_percpu_gfp(sizeof(struct htab_elem *), 8, GFP_USER);
    }

前面介紹 struct bpf_htab 結構體時已經提到,這裏有 三段內存 需要分配:

  1. buckets 空間:注意這裏的 buckets 並不是用來存 elements 的,只是一 個鏈表,裏面存放了鏈表、鎖等簡單數據;
  2. 如果啟用了預分配,為一次性為所有 elements 分配內存空間。

    預分配的好處是性能更高,因為不需要為每個 element 動態分配內存;壞處也顯而易 見,就是 一上來就會佔用掉這個 map 能佔用的最大內存 ,即使這時 map 還是空的。

    舉例:Cilium 有一個 --preallocate-bpf-maps='false' 選項,默認是 false。

  3. extra_elems

查詢 map: void *htab_map_lookup_elem()

邏輯:

  1. 根據 key 計算出一個哈希值 hash
  2. hash 作為 數組索引 ,直接定位到對應的 bucket( O(1) ),
  3. 順序遍歷 bucket 內的 struct htab_elem 元素,如果 hashkey 都相同,就返回對應的 value 其實地址;否則返回空。這裏雖然是順序遍歷,但 除非有哈希衝突,否則第一次就返回了

下面看實現。

傳入參數是 bpf map 和 key, 返回的是 value 的起始地址

// kernel/bpf/hashtab.c

// 返回 value 起始地址
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
    struct htab_elem *l = __htab_map_lookup_elem(map, key);
    if (l)
        // l->key 指向的是 key+value 的地址,因此 l->key + key_size 指向的才是 value,
        // 此外還需要考慮 key_size 對其 8 字節,因此得到下面一行代碼:
        return l->key + round_up(map->key_size, 8);

    return NULL;
}

static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
    struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

    hash = htab_map_hash(key, key_size, htab->hashrnd);
    head = select_bucket(htab, hash); // 以 hash 作為數組索引,直接定位到 bucket 起始地址,返回 bucket 內的鏈表頭指針
                                      // 簡化之後:head = htab->buckets[hash]->head
    return lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
}

// can be called without bucket lock. it will repeat the loop in
// the unlikely event when elements moved from one bucket into another while link list is being walked
static struct htab_elem *
lookup_nulls_elem_raw(struct hlist_nulls_head *head, u32 hash, void *key, u32 key_size, u32 n_buckets)
{
    struct hlist_nulls_node *n; // 通用鏈表類型之一,與普通鏈表的區別是最後一個元素不是 NULL 指針,而是 'nulls' 元素
    struct htab_elem *l;

again:
    // 順序遍歷 head 指向的(bucket)鏈表。
    hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
       if (l->hash == hash && !memcmp(&l->key, key, key_size)) // 哈希值和 key 都相同
           return l;
    // 為便於理解,以上代碼展開和簡化之後等價於下面的(偽)代碼
    // for (n=head; n!=nulls && l=(struct htab_elem *)n->hash_node; n=n->next)
    //     if (l->hash == hash && !memcmp(&l->key, key, key_size)) // 哈希值和 key 都相同
    //         return l;

    if (get_nulls_value(n) != (hash & (n_buckets - 1)))
        goto again;

    return NULL;
}

插入或更新 map: int htab_map_update_elem()

插入和更新都是執行這個函數,返回值:

調用棧

htab_map_update_elem
 |-hash = htab_map_hash(key, key_size, htab->hashrnd)
 |-b = __select_bucket(htab, hash)
 |-l_old = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets)
 |
 |-if l_old
 |   copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false)
 |   return 0
 |
 |-l_old = lookup_elem_raw(head, hash, key, key_size);
 |-check_flags(htab, l_old, map_flags);
 |-l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old);
 |          |-if prealloc
 |          |   ...
 |          |-else
 |          |   atomic_inc_return(&htab->count)
 |          |   l_new = kmalloc_node()
 |          |
 |          |-memcpy(l_new->key, key, key_size);
 |          |-copy_map_value(&htab->map, l_new->key + round_up(key_size, 8), value);
 |          |-l_new->hash = hash;
 |          |-return l_new
 |
 |-hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 |
 |-if l_old
 |    hlist_nulls_del_rcu(&l_old->hash_node);
 |    if (!htab_is_prealloc(htab))
 |       free_htab_elem(htab, l_old);
 |-return 0

實現

// kernel/bpf/hashtab.c

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
{
    struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
    struct htab_elem *l_new = NULL, *l_old;

    key_size = map->key_size;
    hash = htab_map_hash(key, key_size, htab->hashrnd); // 根據 key 計算出一個哈希值

    struct bucket *b = __select_bucket(htab, hash);
    head = &b->head;

    if (unlikely(map_flags & BPF_F_LOCK)) {
        /* find an element without taking the bucket lock */
        l_old = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
        ret = check_flags(htab, l_old, map_flags);
        if (ret)
            return ret;

        if (l_old) { // 如果已經存在:獲取 elem lock,然後原地更新 value
            copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false);
            return 0;
        }
    }

    // 至此,確認老記錄不存在。
    // 接下來:獲取 bucket lock 然後再查詢一次。99.9% 的概率仍然是查不到,但這一步必須做。
    flags = htab_lock_bucket(htab, b);
    l_old = lookup_elem_raw(head, hash, key, key_size);
    check_flags(htab, l_old, map_flags);

    if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
        // first lookup without the bucket lock didn't find the element,
        // but second lookup with the bucket lock found it.
        // This case is highly unlikely, but has to be dealt with:
        // grab the element lock in addition to the bucket lock and update element in place
        copy_map_value_locked(map, l_old->key + round_up(key_size, 8), value, false);
        ret = 0;
        goto err;
    }

    // 分配新 element。內部:根據 map 是否是 prealloc 模式,處理邏輯會有所不同
    l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old);

    // 插到鏈表頭,so that concurrent search will find it before old elem */
    hlist_nulls_add_head_rcu(&l_new->hash_node, head);

    // 如果老的還在,刪掉
    if (l_old) {
        hlist_nulls_del_rcu(&l_old->hash_node);
        if (!htab_is_prealloc(htab))
            free_htab_elem(htab, l_old);
    }
    ret = 0;

err:
    htab_unlock_bucket(htab, b, flags);
    return ret;
}

static struct htab_elem *
alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash,
                     bool percpu, bool onallcpus, struct htab_elem *old_elem)
{
    u32 size = htab->map.value_size;
    bool prealloc = htab_is_prealloc(htab);
    struct htab_elem *l_new, **pl_new;
    void __percpu *pptr;

    if (prealloc) {
        if (old_elem) {
            pl_new = this_cpu_ptr(htab->extra_elems);
            l_new = *pl_new;
            htab_put_fd_value(htab, old_elem);
            *pl_new = old_elem;
        } else {
            struct pcpu_freelist_node *l;

            l = __pcpu_freelist_pop(&htab->freelist);
            l_new = container_of(l, struct htab_elem, fnode);
        }
    } else {
        if (atomic_inc_return(&htab->count) > htab->map.max_entries)
            if (!old_elem) {
                l_new = ERR_PTR(-E2BIG);
                goto dec_count;
            }
        l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC, htab->map.numa_node);
        check_and_init_map_lock(&htab->map, l_new->key + round_up(key_size, 8));
    }

    memcpy(l_new->key, key, key_size);

    if (percpu) {
        ...
    } else if (fd_htab_map_needs_adjust(htab)) {
        size = round_up(size, 8);
        memcpy(l_new->key + round_up(key_size, 8), value, size);
    } else {
        copy_map_value(&htab->map, l_new->key + round_up(key_size, 8), value);
    }

    l_new->hash = hash;
    return l_new;
dec_count:
    atomic_dec(&htab->count);
    return l_new;
}

獲取下一個 key: int htab_map_get_next_key(map, key, void *next_key)

為什麼要有獲取下一個 key 的方法?

邏輯

獲取非第一個 key:

  1. 根據 key 計算一個哈希值 hash
  2. hash 作為數組索引,直接定位到對應的 bucket;
  3. 在該 bucket 內查找給定 key;如果找到,按順序返回下一個 key。

    這裏需要注意的是, 下一個 key 可能在當前 bucket,也可能在下一個 bucket

獲取第一個 key:沒有別的辦法,只能順序遍歷 buckets 及每個 buckets 內的各元素。

// kernel/bpf/hashtab.c

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
    struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
    struct htab_elem *l, *next_l;

    key_size = map->key_size;

    if (!key)
        goto find_first_elem;

    hash = htab_map_hash(key, key_size, htab->hashrnd);
    head = select_bucket(htab, hash);

    /* lookup the key */
    l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
    if (!l)
        goto find_first_elem;

    /* key was found, get next key in the same bucket */
    next_l = hlist_nulls_entry_safe(hlist_nulls_next_rcu(&l->hash_node), struct htab_elem, hash_node);
    if (next_l) { // if next elem in this hash list is non-zero, just return it
        memcpy(next_key, next_l->key, key_size);
        return 0;
    }

    /* no more elements in this hash list, go to the next bucket */
    i = hash & (htab->n_buckets - 1);
    i++;

find_first_elem:
    for (; i < htab->n_buckets; i++) { // iterate over buckets
        head = select_bucket(htab, i);

        /* pick first element in the bucket */
        next_l = hlist_nulls_entry_safe(hlist_nulls_first_rcu(head), struct htab_elem, hash_node);
        if (next_l) { // if it's not empty, just return it
            memcpy(next_key, next_l->key, key_size);
            return 0;
        }
    }

    /* iterated over all buckets and all elements */
    return -ENOENT;
}

2 BPF_MAP_TYPE_PERCPU_HASH

3 BPF_MAP_TYPE_LRU_HASH

為每個 bucket 維護一個 LRU list ,bucket 滿了之後刪除最老的。

4 BPF_MAP_TYPE_LRU_PERCPU_HASH

5 BPF_MAP_TYPE_HASH_OF_MAPS

————————————————————————

Array Maps

————————————————————————

類型:

BPF_MAP_TYPE_ARRAY
BPF_MAP_TYPE_PERCPU_ARRAY
BPF_MAP_TYPE_PROG_ARRAY
BPF_MAP_TYPE_PERF_EVENT_ARRAY
BPF_MAP_TYPE_ARRAY_OF_MAPS
BPF_MAP_TYPE_CGROUP_ARRAY

都是在 kernel/bpf/arraymap.c 中實現的。

方法:

// kernel/bpf/arraymap.c

const struct bpf_map_ops array_map_ops = {
    .map_alloc_check       = array_map_alloc_check,
    .map_alloc             = array_map_alloc,
    .map_free              = array_map_free,
    .map_get_next_key      = array_map_get_next_key,
    .map_lookup_elem       = array_map_lookup_elem,
    .map_update_elem       = array_map_update_elem,
    .map_delete_elem       = array_map_delete_elem,
    .map_gen_lookup        = array_map_gen_lookup,
    .map_direct_value_addr = array_map_direct_value_addr,
    .map_direct_value_meta = array_map_direct_value_meta,
    .map_mmap              = array_map_mmap,
    .map_seq_show_elem     = array_map_seq_show_elem,
    .map_check_btf         = array_map_check_btf,
    .map_lookup_batch      = generic_map_lookup_batch,
    .map_update_batch      = generic_map_update_batch,
};

1 BPF_MAP_TYPE_ARRAY

相比於 hash map,array map 的實現就簡單多了 ,因為 key 就是數組 index ,所以map 的增刪查改都是數組操作。

創建 map: struct bpf_map *array_map_alloc()

// kernel/bpf/arraymap.c

static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
    bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
    struct bpf_map_memory mem;
    struct bpf_array *array;

    elem_size = round_up(attr->value_size, 8);
    max_entries = attr->max_entries;
    array_size = sizeof(*array); // 內存要 page-aligned

    bpf_map_charge_init(&mem, cost); // 確保內存不會溢出

    /* allocate all map elements and zero-initialize them */
    if (attr->map_flags & BPF_F_MMAPABLE) {
        /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
        void *data = bpf_map_area_mmapable_alloc(array_size, numa_node);
        array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value);
    } else {
        array = bpf_map_area_alloc(array_size, numa_node);
    }

    array->index_mask = index_mask;

    /* copy mandatory map attributes */
    bpf_map_init_from_attr(&array->map, attr);
    bpf_map_charge_move(&array->map.memory, &mem);
    array->elem_size = elem_size;

    return &array->map;
}

查詢 map: void *array_map_lookup_elem()

// kernel/bpf/arraymap.c

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    u32 index = *(u32 *)key; // 直接將 key 轉換成 index

    if (unlikely(index >= array->map.max_entries))
        return NULL;

    return array->value + array->elem_size * (index & array->index_mask);
}

2 BPF_MAP_TYPE_PERCPU_ARRAY

3 BPF_MAP_TYPE_PROG_ARRAY

這種類型的 array 存放的是 BPF 程序的文件描述符(fd),在尾調用時使用。

// kernel/bpf/arraymap.c

const struct bpf_map_ops prog_array_map_ops = {
    .map_alloc_check = fd_array_map_alloc_check,
    .map_alloc = prog_array_map_alloc,
    .map_free = prog_array_map_free,
    .map_poke_track = prog_array_map_poke_track,
    .map_poke_untrack = prog_array_map_poke_untrack,
    .map_poke_run = prog_array_map_poke_run,
    .map_get_next_key = array_map_get_next_key,
    .map_lookup_elem = fd_array_map_lookup_elem,
    .map_delete_elem = fd_array_map_delete_elem,
    .map_fd_get_ptr = prog_fd_array_get_ptr,
    .map_fd_put_ptr = prog_fd_array_put_ptr,
    .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
    .map_release_uref = prog_array_map_clear,
    .map_seq_show_elem = prog_array_map_seq_show_elem,
};

4 BPF_MAP_TYPE_PERF_EVENT_ARRAY

5 BPF_MAP_TYPE_ARRAY_OF_MAPS

6 BPF_MAP_TYPE_CGROUP_ARRAY

// kernel/bpf/arraymap.c

const struct bpf_map_ops cgroup_array_map_ops = {
    .map_alloc_check = fd_array_map_alloc_check,
    .map_alloc = array_map_alloc,
    .map_free = cgroup_fd_array_free,
    .map_get_next_key = array_map_get_next_key,
    .map_lookup_elem = fd_array_map_lookup_elem,
    .map_delete_elem = fd_array_map_delete_elem,
    .map_fd_get_ptr = cgroup_fd_array_get_ptr,
    .map_fd_put_ptr = cgroup_fd_array_put_ptr,
    .map_check_btf = map_check_no_btf,
};

————————————————————————

CGroup Maps

————————————————————————

管理 attach 到 cgroup 的 BPF 程序。

CGroup 相關結構體

// kernel/bpf/cgroup.c

const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
    .get_func_proto     = cgroup_dev_func_proto,
    .is_valid_access    = cgroup_dev_is_valid_access,
};

const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
    .get_func_proto     = sysctl_func_proto,
    .is_valid_access    = sysctl_is_valid_access,
    .convert_ctx_access = sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};

const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
    .get_func_proto     = cg_sockopt_func_proto,
    .is_valid_access    = cg_sockopt_is_valid_access,
    .convert_ctx_access = cg_sockopt_convert_ctx_access,
    .gen_prologue       = cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};

1 BPF_MAP_TYPE_CGROUP_ARRAY

見上面。

2 BPF_MAP_TYPE_CGROUP_STORAGE

// kernel/bpf/local_storage.c

const struct bpf_map_ops cgroup_storage_map_ops = {
    .map_alloc         = cgroup_storage_map_alloc,
    .map_free          = cgroup_storage_map_free,
    .map_get_next_key  = cgroup_storage_get_next_key,
    .map_lookup_elem   = cgroup_storage_lookup_elem,
    .map_update_elem   = cgroup_storage_update_elem,
    .map_delete_elem   = cgroup_storage_delete_elem,
    .map_check_btf     = cgroup_storage_check_btf,
    .map_seq_show_elem = cgroup_storage_seq_show_elem,
};

結構體

struct bpf_cgroup_storage_map :cgroup storage map 定義

與 hash map 類型,這裏定義了一個新結構體來表示 cgroup storage map,這個結構體 將標準 bpf map 嵌套到結構體中作為一個字段

// kernel/bpf/local_storage.c

struct bpf_cgroup_storage_map {
    struct bpf_map map;            // 標準的 bpf map

    spinlock_t               lock;
    struct     bpf_prog_aux *aux;
    struct     rb_root       root;
    struct     list_head     list;
};

struct bpf_cgroup_storage_key :key 定義

這種類型的 map, key 定義為

// include/uapi/linux/bpf.h

struct bpf_cgroup_storage_key {
    __u64    cgroup_inode_id;    /* cgroup inode id */
    __u32    attach_type;        /* program attach type */
};

其他結構體

// include/linux/bpf-cgroup.h

struct bpf_storage_buffer {
    struct rcu_head rcu;
    char data[];
};

struct bpf_cgroup_storage {
    union {
        struct bpf_storage_buffer *buf;
        void __percpu *percpu_buf;
    };

    struct bpf_cgroup_storage_map *map;
    struct bpf_cgroup_storage_key key;

    struct list_head list;
    struct rb_node node;
    struct rcu_head rcu;
};

struct bpf_cgroup_link {
    struct bpf_link link;
    struct cgroup *cgroup;
    enum bpf_attach_type type;
};

// Attach 到每個 cgroup 的 BPF 程序組成一個鏈表
struct bpf_prog_list {
    struct list_head node;
    struct bpf_prog *prog;
    struct bpf_cgroup_link *link;

    struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
};

struct cgroup_bpf {
    // 這個 cgroup 內的有效 BPF 程序
    struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];

    // attach 到這個 cgroup 的 BPF 程序
    struct list_head progs[MAX_BPF_ATTACH_TYPE];
    // attach flags
    // * flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will have either zero or one element
    // * BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
    u32 flags[MAX_BPF_ATTACH_TYPE];

    /* temp storage for effective prog array used by prog_attach/detach */
    struct bpf_prog_array *inactive;

    /* reference counter used to detach bpf programs after cgroup removal */
    struct percpu_ref refcnt;

    /* cgroup_bpf is released using a work queue */
    struct work_struct release_work;
};

創建 map: struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)

// kernel/bpf/local_storage.c

static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
    int numa_node = bpf_map_attr_numa_node(attr);
    struct bpf_cgroup_storage_map *map;
    struct bpf_map_memory mem;

    if (attr->max_entries) /* max_entries is not used and enforced to be 0 */
        return ERR_PTR(-EINVAL);

    bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));

    map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node);
    bpf_map_charge_move(&map->map.memory, &mem);

    bpf_map_init_from_attr(&map->map, attr); // 初始化(複製)map 元數據字段

    map->root = RB_ROOT;
    INIT_LIST_HEAD(&map->list);

    return &map->map;
}

初始化指定類型的 cgroup storage: struct bpf_cgroup_storage *bpf_cgroup_storage_alloc()

一個 cgroup 內的所有 BPF 程序 組織成一個鏈表 struct bpf_prog_list ,這些程序 共用一組 cgroup storage (按類型組織成數 組)。在將 BPF 程序 attach 到 cgroup 時,會有如下的調用棧:

__cgroup_bpf_attach
  |-bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)
      |-for_each_cgroup_storage_type(stype) {
          storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
        }
// kernel/bpf/local_storage.c

struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype)
{
    struct bpf_cgroup_storage *storage;
    struct bpf_map *map;

    map = prog->aux->cgroup_storage[stype];
    size = bpf_cgroup_storage_calculate_size(map, &pages);

    if (bpf_map_charge_memlock(map, pages))
        return ERR_PTR(-EPERM);

    storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), __GFP_ZERO | GFP_USER, map->numa_node);

    flags = __GFP_ZERO | GFP_USER;

    if (stype == BPF_CGROUP_STORAGE_SHARED) {
        storage->buf = kmalloc_node(size, flags, map->numa_node);
        check_and_init_map_lock(map, storage->buf->data);
    } else {
        storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
    }

    storage->map = (struct bpf_cgroup_storage_map *)map;
    return storage;
}

3 BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE

————————————————————————

Tracing Maps

————————————————————————

1 BPF_MAP_TYPE_STACK_TRACE

// kernel/bpf/stackmap.c

const struct bpf_map_ops stack_trace_map_ops = {
    .map_alloc = stack_map_alloc,
    .map_free = stack_map_free,
    .map_get_next_key = stack_map_get_next_key,
    .map_lookup_elem = stack_map_lookup_elem,
    .map_update_elem = stack_map_update_elem,
    .map_delete_elem = stack_map_delete_elem,
    .map_check_btf = map_check_no_btf,
};

2 BPF_MAP_TYPE_STACK

// kernel/bpf/queue_stack_maps.c

const struct bpf_map_ops stack_map_ops = {
    .map_alloc_check = queue_stack_map_alloc_check,
    .map_alloc = queue_stack_map_alloc,
    .map_free = queue_stack_map_free,
    .map_lookup_elem = queue_stack_map_lookup_elem,
    .map_update_elem = queue_stack_map_update_elem,
    .map_delete_elem = queue_stack_map_delete_elem,
    .map_push_elem = queue_stack_map_push_elem,
    .map_pop_elem = stack_map_pop_elem,
    .map_peek_elem = stack_map_peek_elem,
    .map_get_next_key = queue_stack_map_get_next_key,
};

3 BPF_MAP_TYPE_RINGBUF

// kernel/bpf/ringbuf.c

const struct bpf_map_ops ringbuf_map_ops = {
    .map_alloc = ringbuf_map_alloc,
    .map_free = ringbuf_map_free,
    .map_mmap = ringbuf_map_mmap,
    .map_poll = ringbuf_map_poll,
    .map_lookup_elem = ringbuf_map_lookup_elem,
    .map_update_elem = ringbuf_map_update_elem,
    .map_delete_elem = ringbuf_map_delete_elem,
    .map_get_next_key = ringbuf_map_get_next_key,
};

4 BPF_MAP_TYPE_PERF_EVENT_ARRAY

見。

————————————————————————

Socket Maps

————————————————————————

1 BPF_MAP_TYPE_SOCKMAP

前面的都是實現在 kernel/bpf/ 目錄下,而 這個的實現是 net/core/sock_map.c

const struct bpf_map_ops sock_map_ops = {
    .map_alloc                = sock_map_alloc,
    .map_free                 = sock_map_free,
    .map_get_next_key         = sock_map_get_next_key,
    .map_lookup_elem_sys_only = sock_map_lookup_sys,
    .map_update_elem          = sock_map_update_elem,
    .map_delete_elem          = sock_map_delete_elem,
    .map_lookup_elem          = sock_map_lookup,
    .map_release_uref         = sock_map_release_progs,
    .map_check_btf            = map_check_no_btf,
};

2 BPF_MAP_TYPE_REUSEPORT_SOCKARRAY

// kernel/bpf/reuseport_array.c

const struct bpf_map_ops reuseport_array_ops = {
    .map_alloc_check = reuseport_array_alloc_check,
    .map_alloc = reuseport_array_alloc,
    .map_free = reuseport_array_free,
    .map_lookup_elem = reuseport_array_lookup_elem,
    .map_get_next_key = reuseport_array_get_next_key,
    .map_delete_elem = reuseport_array_delete_elem,
};

配合 BPF_PROG_TYPE_SK_REUSEPORT 類型的 BPF 程序使用, 加速 socket 查找

3 BPF_MAP_TYPE_SK_STORAGE

————————————————————————

XDP Maps

————————————————————————

1 BPF_MAP_TYPE_SOCKHASH

前面的都是實現在 kernel/bpf/ 目錄下,而 這個的實現是 net/core/sock_map.c

const struct bpf_map_ops sock_hash_ops = {
    .map_alloc                = sock_hash_alloc,
    .map_free                 = sock_hash_free,
    .map_get_next_key         = sock_hash_get_next_key,
    .map_update_elem          = sock_hash_update_elem,
    .map_delete_elem          = sock_hash_delete_elem,
    .map_lookup_elem          = sock_hash_lookup,
    .map_lookup_elem_sys_only = sock_hash_lookup_sys,
    .map_release_uref         = sock_hash_release_progs,
    .map_check_btf            = map_check_no_btf,
};

2 BPF_MAP_TYPE_DEVMAP

kernel/bpf/devmap.c

功能與 sockmap 類似,但用於 XDP 場景 ,在 bpf_redirect() 時觸發。

3 BPF_MAP_TYPE_DEVMAP_HASH

kernel/bpf/devmap.c

const struct bpf_map_ops dev_map_hash_ops = {
    .map_alloc = dev_map_alloc,
    .map_free = dev_map_free,
    .map_get_next_key = dev_map_hash_get_next_key,
    .map_lookup_elem = dev_map_hash_lookup_elem,
    .map_update_elem = dev_map_hash_update_elem,
    .map_delete_elem = dev_map_hash_delete_elem,
    .map_check_btf = map_check_no_btf,
};

4 BPF_MAP_TYPE_XSKMAP

XDP.

————————————————————————

其他 Maps

————————————————————————

1 BPF_MAP_TYPE_CPUMAP

2 BPF_MAP_TYPE_QUEUE

// kernel/bpf/queue_stack_maps.c

const struct bpf_map_ops queue_map_ops = {
    .map_alloc_check = queue_stack_map_alloc_check,
    .map_alloc = queue_stack_map_alloc,
    .map_free = queue_stack_map_free,
    .map_lookup_elem = queue_stack_map_lookup_elem,
    .map_update_elem = queue_stack_map_update_elem,
    .map_delete_elem = queue_stack_map_delete_elem,
    .map_push_elem = queue_stack_map_push_elem,
    .map_pop_elem = queue_map_pop_elem,
    .map_peek_elem = queue_map_peek_elem,
    .map_get_next_key = queue_stack_map_get_next_key,
};

3 BPF_MAP_TYPE_STRUCT_OPS

4 BPF_MAP_TYPE_LPM_TRIE

// kernel/bpf/lpm_trie.c

const struct bpf_map_ops trie_map_ops = {
    .map_alloc = trie_alloc,
    .map_free = trie_free,
    .map_get_next_key = trie_get_next_key,
    .map_lookup_elem = trie_lookup_elem,
    .map_update_elem = trie_update_elem,
    .map_delete_elem = trie_delete_elem,
    .map_check_btf = trie_check_btf,
};

« BPF 進階筆記(二):BPF Map 類型詳解:使用場景、程序示例