// SPDX-License-Identifier: GPL-2.0
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/ktime.h>
#include <linux/math64.h>

#include "subvolume_metrics.h"
#include "mds_client.h"
#include "super.h"

/**
 * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
 * @node: Red-black tree linkage for tracker->tree
 * @subvolume_id: Subvolume identifier (key for rb-tree lookup)
 * @read_ops: Accumulated read operation count since last snapshot
 * @write_ops: Accumulated write operation count since last snapshot
 * @read_bytes: Accumulated bytes read since last snapshot
 * @write_bytes: Accumulated bytes written since last snapshot
 * @read_latency_us: Sum of read latencies in microseconds
 * @write_latency_us: Sum of write latencies in microseconds
 */
struct ceph_subvol_metric_rb_entry {
	struct rb_node node;
	u64 subvolume_id;
	u64 read_ops;
	u64 write_ops;
	u64 read_bytes;
	u64 write_bytes;
	u64 read_latency_us;
	u64 write_latency_us;
};

static struct kmem_cache *ceph_subvol_metric_entry_cachep;

void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
{
	spin_lock_init(&tracker->lock);
	tracker->tree = RB_ROOT_CACHED;
	tracker->nr_entries = 0;
	tracker->enabled = false;
	atomic64_set(&tracker->snapshot_attempts, 0);
	atomic64_set(&tracker->snapshot_empty, 0);
	atomic64_set(&tracker->snapshot_failures, 0);
	atomic64_set(&tracker->record_calls, 0);
	atomic64_set(&tracker->record_disabled, 0);
	atomic64_set(&tracker->record_no_subvol, 0);
	atomic64_set(&tracker->total_read_ops, 0);
	atomic64_set(&tracker->total_read_bytes, 0);
	atomic64_set(&tracker->total_write_ops, 0);
	atomic64_set(&tracker->total_write_bytes, 0);
}

/* Caller must hold tracker->lock. */
static struct ceph_subvol_metric_rb_entry *
__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
{
	struct rb_node *node = tracker->tree.rb_root.rb_node;

	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);

		if (subvol_id < entry->subvolume_id)
			node = node->rb_left;
		else if (subvol_id > entry->subvolume_id)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Insert @entry into the tree. Returns @entry on success, or the
 * already-present entry with the same key. Caller must hold tracker->lock.
 */
static struct ceph_subvol_metric_rb_entry *
__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
	       struct ceph_subvol_metric_rb_entry *entry)
{
	struct rb_node **link = &tracker->tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct ceph_subvol_metric_rb_entry *cur =
			rb_entry(*link, struct ceph_subvol_metric_rb_entry, node);

		parent = *link;
		if (entry->subvolume_id < cur->subvolume_id) {
			link = &(*link)->rb_left;
		} else if (entry->subvolume_id > cur->subvolume_id) {
			link = &(*link)->rb_right;
			leftmost = false;
		} else {
			return cur;
		}
	}

	rb_link_node(&entry->node, parent, link);
	rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
	tracker->nr_entries++;
	return entry;
}

/* Caller must hold tracker->lock. */
static void ceph_subvolume_metrics_clear_locked(
		struct ceph_subvolume_metrics_tracker *tracker)
{
	struct rb_node *node = rb_first_cached(&tracker->tree);

	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		struct rb_node *next = rb_next(node);

		rb_erase_cached(&entry->node, &tracker->tree);
		tracker->nr_entries--;
		kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
		node = next;
	}
	tracker->tree = RB_ROOT_CACHED;
}

void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
{
	spin_lock(&tracker->lock);
	ceph_subvolume_metrics_clear_locked(tracker);
	tracker->enabled = false;
	spin_unlock(&tracker->lock);
}
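/**
 * ceph_subvolume_metrics_enable - toggle per-subvolume metrics collection
 * @tracker: tracker to enable or disable
 * @enable: true to start collecting, false to stop
 *
 * Disabling also drops every accumulated entry, so a later re-enable
 * starts from a clean slate rather than reporting stale counters.
 */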
void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
				   bool enable)
{
	spin_lock(&tracker->lock);
	if (enable) {
		tracker->enabled = true;
	} else {
		tracker->enabled = false;
		ceph_subvolume_metrics_clear_locked(tracker);
	}
	spin_unlock(&tracker->lock);
}

void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
				   u64 subvol_id, bool is_write, size_t size,
				   u64 latency_us)
{
	struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
	bool retry = false;

	/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
	if (!READ_ONCE(tracker->enabled) || subvol_id == CEPH_SUBVOLUME_ID_NONE ||
	    !size || !latency_us)
		return;

	/*
	 * Retry loop so the allocation happens outside the lock:
	 * 1. First iteration: lookup under lock; on a miss, drop the lock,
	 *    allocate, and retry.
	 * 2. Second iteration: lookup again (the entry may have been
	 *    inserted meanwhile), insert ours if still missing.
	 * 3. On a race (another thread inserted the same key first): free
	 *    our allocation and retry.
	 * All successful paths exit via return, so the retry flag never
	 * needs to be reset.
	 */
	do {
		spin_lock(&tracker->lock);
		if (!tracker->enabled) {
			spin_unlock(&tracker->lock);
			if (new_entry)
				kmem_cache_free(ceph_subvol_metric_entry_cachep,
						new_entry);
			return;
		}

		entry = __lookup_entry(tracker, subvol_id);
		if (!entry) {
			if (!new_entry) {
				spin_unlock(&tracker->lock);
				new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
							      GFP_NOFS);
				if (!new_entry)
					return;
				new_entry->subvolume_id = subvol_id;
				retry = true;
				continue;
			}
			entry = __insert_entry(tracker, new_entry);
			if (entry != new_entry) {
				/* raced with another insert */
				spin_unlock(&tracker->lock);
				kmem_cache_free(ceph_subvol_metric_entry_cachep,
						new_entry);
				new_entry = NULL;
				retry = true;
				continue;
			}
			new_entry = NULL;
		}

		if (is_write) {
			entry->write_ops++;
			entry->write_bytes += size;
			entry->write_latency_us += latency_us;
			atomic64_inc(&tracker->total_write_ops);
			atomic64_add(size, &tracker->total_write_bytes);
		} else {
			entry->read_ops++;
			entry->read_bytes += size;
			entry->read_latency_us += latency_us;
			atomic64_inc(&tracker->total_read_ops);
			atomic64_add(size, &tracker->total_read_bytes);
		}
		spin_unlock(&tracker->lock);

		/* allocated on the first pass, but another thread inserted first */
		if (new_entry)
			kmem_cache_free(ceph_subvol_metric_entry_cachep,
					new_entry);
		return;
	} while (retry);
}
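/**
 * ceph_subvolume_metrics_snapshot - copy out accumulated per-subvolume metrics
 * @tracker: tracker to snapshot
 * @out: on success with data, set to a kcalloc'd array the caller frees
 *       via ceph_subvolume_metrics_free_snapshot()
 * @nr: number of valid entries in @out
 * @consume: if true, copied entries are removed from the tree
 *
 * Counts active entries in one locked pass, allocates with the lock
 * dropped, then copies in a second locked pass. Entries created between
 * the two passes are left in the tree for a later snapshot.
 *
 * A minimal (hypothetical) consumer, e.g. from a periodic metrics work
 * item -- the encode/send step is only sketched, not a real API:
 *
 *	struct ceph_subvol_metric_snapshot *snap;
 *	u32 nr;
 *
 *	if (ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
 *					    &snap, &nr, true))
 *		return;		// -ENOMEM
 *	if (nr) {
 *		// encode snap[0..nr) into an MDS metrics message here
 *		ceph_subvolume_metrics_free_snapshot(snap);
 *	}
 *
 * Return: 0 on success (including the empty case, with *nr == 0 and
 * *out == NULL), -ENOMEM on allocation failure.
 */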
int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
				    struct ceph_subvol_metric_snapshot **out,
				    u32 *nr, bool consume)
{
	struct ceph_subvol_metric_snapshot *snap = NULL;
	struct rb_node *node;
	u32 count = 0, idx = 0;
	int ret = 0;

	*out = NULL;
	*nr = 0;

	if (!READ_ONCE(tracker->enabled))
		return 0;

	atomic64_inc(&tracker->snapshot_attempts);

	spin_lock(&tracker->lock);
	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);

		/* Count entries with ANY I/O activity (read OR write) */
		if (entry->read_ops || entry->write_ops)
			count++;
	}
	spin_unlock(&tracker->lock);

	if (!count) {
		atomic64_inc(&tracker->snapshot_empty);
		return 0;
	}

	snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
	if (!snap) {
		atomic64_inc(&tracker->snapshot_failures);
		return -ENOMEM;
	}

	spin_lock(&tracker->lock);
	node = rb_first_cached(&tracker->tree);
	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		struct rb_node *next = rb_next(node);

		/* Prune entries with no I/O activity at all */
		if (!entry->read_ops && !entry->write_ops) {
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
			node = next;
			continue;
		}

		if (idx >= count) {
			/* an entry was added between the two passes */
			pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
				idx, count);
			break;
		}

		snap[idx].subvolume_id = entry->subvolume_id;
		snap[idx].read_ops = entry->read_ops;
		snap[idx].write_ops = entry->write_ops;
		snap[idx].read_bytes = entry->read_bytes;
		snap[idx].write_bytes = entry->write_bytes;
		snap[idx].read_latency_us = entry->read_latency_us;
		snap[idx].write_latency_us = entry->write_latency_us;
		idx++;

		if (consume) {
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
		}
		node = next;
	}
	spin_unlock(&tracker->lock);

	if (!idx) {
		kfree(snap);
		snap = NULL;
	} else {
		*nr = idx;
		*out = snap;
	}
	return ret;
}

void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
{
	kfree(snapshot);
}

/*
 * Dump subvolume metrics to a seq_file for debugfs.
 *
 * Iterates the rb-tree directly under the spinlock to avoid allocation.
 * The lock hold time is small since we only issue seq_printf() calls.
 */
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
				 struct seq_file *s)
{
	struct rb_node *node;
	bool found = false;

	spin_lock(&tracker->lock);
	if (!tracker->enabled) {
		spin_unlock(&tracker->lock);
		seq_puts(s, "subvolume metrics disabled\n");
		return;
	}

	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		u64 avg_rd_lat, avg_wr_lat;

		if (!entry->read_ops && !entry->write_ops)
			continue;

		if (!found) {
			seq_puts(s, "subvol_id      rd_ops    rd_bytes    rd_avg_lat_us   wr_ops    wr_bytes    wr_avg_lat_us\n");
			seq_puts(s, "--------------------------------------------------------------------------------------------\n");
			found = true;
		}

		avg_rd_lat = entry->read_ops ?
			div64_u64(entry->read_latency_us, entry->read_ops) : 0;
		avg_wr_lat = entry->write_ops ?
			div64_u64(entry->write_latency_us, entry->write_ops) : 0;

		seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
			   entry->subvolume_id, entry->read_ops,
			   entry->read_bytes, avg_rd_lat,
			   entry->write_ops, entry->write_bytes, avg_wr_lat);
	}
	spin_unlock(&tracker->lock);

	if (!found)
		seq_puts(s, "(no subvolume metrics collected)\n");
}

void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci, bool is_write,
				      size_t bytes, ktime_t start, ktime_t end)
{
	struct ceph_subvolume_metrics_tracker *tracker;
	u64 subvol_id;
	s64 delta_us;

	if (!mdsc || !ci || !bytes)
		return;

	tracker = &mdsc->subvol_metrics;
	atomic64_inc(&tracker->record_calls);

	if (!ceph_subvolume_metrics_enabled(tracker)) {
		atomic64_inc(&tracker->record_disabled);
		return;
	}

	subvol_id = READ_ONCE(ci->i_subvolume_id);
	if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
		atomic64_inc(&tracker->record_no_subvol);
		return;
	}

	delta_us = ktime_to_us(ktime_sub(end, start));
	/* clamp to 1us so sub-microsecond I/Os still pass the !latency_us check */
	if (delta_us <= 0)
		delta_us = 1;

	ceph_subvolume_metrics_record(tracker, subvol_id, is_write, bytes,
				      (u64)delta_us);
}

int __init ceph_subvolume_metrics_cache_init(void)
{
	ceph_subvol_metric_entry_cachep = KMEM_CACHE(ceph_subvol_metric_rb_entry,
						     SLAB_RECLAIM_ACCOUNT);
	if (!ceph_subvol_metric_entry_cachep)
		return -ENOMEM;
	return 0;
}

void ceph_subvolume_metrics_cache_destroy(void)
{
	kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
}
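/*
 * ceph_subvolume_metrics_cache_init() must run before any tracker can
 * allocate entries, and ceph_subvolume_metrics_cache_destroy() only after
 * the last tracker is torn down. A sketch of the expected (hypothetical)
 * wiring from the fs/ceph module init path, alongside the other cache
 * constructors:
 *
 *	static int __init init_ceph(void)
 *	{
 *		int ret;
 *
 *		ret = ceph_subvolume_metrics_cache_init();
 *		if (ret)
 *			return ret;
 *		...
 *	}
 */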