Topics

[RFC PATCH 03/11] lib: implement perf event ringbuffer for upcall.

William Tu
 

From: Joe Stringer <joe@...>

A flow that misses in the eBPF match-action table triggers an upcall,
which forwards the flow information to ovs-vswitchd using the
skb_perf_event_output helper function. This patch implements the userspace
receiving logic.

Signed-off-by: Joe Stringer <joe@...>
Signed-off-by: William Tu <u9012063@...>
---
lib/perf-event.c | 288 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
lib/perf-event.h | 43 +++++++++
2 files changed, 331 insertions(+)
create mode 100644 lib/perf-event.c
create mode 100644 lib/perf-event.h

diff --git a/lib/perf-event.c b/lib/perf-event.c
new file mode 100644
index 000000000000..c51c936033db
--- /dev/null
+++ b/lib/perf-event.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "perf-event.h"
+
+#include <errno.h>
+#include <linux/perf_event.h>
+#include <linux/unistd.h>
+#include <openvswitch/vlog.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "openvswitch/util.h"
+#include "ovs-atomic.h"
+
+VLOG_DEFINE_THIS_MODULE(perf_event);
+
+COVERAGE_DEFINE(perf_lost);
+COVERAGE_DEFINE(perf_sample);
+COVERAGE_DEFINE(perf_unknown);
+
+struct perf_event_lost { /* Layout used to read PERF_RECORD_LOST records. */
+ struct perf_event_header header; /* Header; 'type' selects this layout. */
+ uint64_t id;
+ uint64_t lost; /* Added to the 'perf_lost' coverage counter when read. */
+};
+
+struct rb_cursor { /* Read position within one perf ring buffer. */
+ struct perf_event_mmap_page *page; /* Page the positions were read from. */
+ uint64_t head, tail; /* Snapshot of page->data_head and page->data_tail. */
+};
+
+/* Opens a PERF_COUNT_SW_BPF_OUTPUT software perf event for 'cpu' and resets
+ * it.
+ *
+ * On success, stores the new file descriptor in '*fd_out' and returns 0.
+ * On failure, logs the error, closes any descriptor that was already opened,
+ * leaves '*fd_out' unmodified and returns a positive errno value. */
+static int
+perf_event_open_fd(int *fd_out, int cpu)
+{
+    struct perf_event_attr attr = {
+        .type = PERF_TYPE_SOFTWARE,
+        .size = sizeof(struct perf_event_attr),
+        .config = PERF_COUNT_SW_BPF_OUTPUT,
+        .sample_type = PERF_SAMPLE_RAW,
+        .watermark = 0,
+        .wakeup_events = 1,
+    };
+    int fd, error;
+
+    /* Arguments after 'attr': pid = -1, cpu, group_fd = -1, flags = 0. */
+    fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+    if (fd < 0) {
+        error = errno;
+        VLOG_ERR("failed to open perf events (%s)", ovs_strerror(error));
+        return error;
+    }
+
+    if (ioctl(fd, PERF_EVENT_IOC_RESET, 1) == -1) {
+        /* Save errno first: logging and close() may overwrite it. */
+        error = errno;
+        VLOG_ERR("failed to reset perf events (%s)", ovs_strerror(error));
+        /* The descriptor is never handed to the caller on this path, so it
+         * must be released here to avoid leaking it. */
+        close(fd);
+        return error;
+    }
+
+    *fd_out = fd;
+    return 0;
+}
+
+/* Opens a perf event for 'cpu' and maps 'page_len' bytes of it, shared and
+ * read/write.
+ *
+ * On success, fills in every member of 'channel' and returns 0.  On failure,
+ * logs the reason, leaves 'channel' unmodified and returns a positive errno
+ * value. */
+int
+perf_channel_open(struct perf_channel *channel, int cpu, size_t page_len)
+{
+    void *mapping;
+    int fd = 0;
+    int err;
+
+    err = perf_event_open_fd(&fd, cpu);
+    if (err) {
+        VLOG_WARN("failed to open perf channel (cpu %d): %s",
+                  cpu, ovs_strerror(err));
+        return err;
+    }
+
+    mapping = mmap(NULL, page_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (mapping != MAP_FAILED) {
+        channel->fd = fd;
+        channel->cpu = cpu;
+        channel->length = page_len;
+        channel->page = mapping;
+        return 0;
+    }
+
+    /* Mapping failed: report it and give the descriptor back. */
+    err = errno;
+    VLOG_ERR("failed to mmap perf event fd (cpu %d): %s",
+             cpu, ovs_strerror(err));
+    close(fd);
+    return err;
+}
+
+/* Enables the perf event behind 'channel' if 'enable' is true, otherwise
+ * disables it.
+ *
+ * Returns 0 on success, otherwise the errno value left by ioctl(). */
+int
+perf_channel_set(struct perf_channel *channel, bool enable)
+{
+    int op;
+
+    if (enable) {
+        op = PERF_EVENT_IOC_ENABLE;
+    } else {
+        op = PERF_EVENT_IOC_DISABLE;
+    }
+
+    return ioctl(channel->fd, op, 0) == -1 ? errno : 0;
+}
+
+/* Disables the perf event behind 'channel', unmaps its page and closes its
+ * file descriptor.  A failure in any step is logged and the remaining steps
+ * still run.  Afterwards 'page', 'fd' and 'length' in 'channel' are reset to
+ * NULL, 0 and 0 respectively. */
+void
+perf_channel_close(struct perf_channel *channel)
+{
+    int disabled = ioctl(channel->fd, PERF_EVENT_IOC_DISABLE, 0);
+
+    if (disabled == -1) {
+        int error = errno;
+
+        VLOG_ERR("failed to disable perf events (%s)",
+                 ovs_strerror(error));
+    }
+
+    if (munmap((void *) channel->page, channel->length) != 0) {
+        VLOG_WARN("Failed to unmap page for cpu %d: %s",
+                  channel->cpu, ovs_strerror(errno));
+    }
+
+    if (close(channel->fd) != 0) {
+        VLOG_WARN("Failed to close page for cpu %d: %s",
+                  channel->cpu, ovs_strerror(errno));
+    }
+
+    channel->length = 0;
+    channel->fd = 0;
+    channel->page = NULL;
+}
+
+/* Returns the address 'data_offset' bytes past the start of the page that
+ * 'cursor' refers to, i.e. the first byte of the ring buffer data. */
+static uint8_t *
+rb_base(struct rb_cursor *cursor)
+{
+    struct perf_event_mmap_page *page = cursor->page;
+    uint8_t *start = (uint8_t *) page;
+
+    return &start[page->data_offset];
+}
+
+/* Returns the address one byte past the ring buffer data of 'cursor', that
+ * is, rb_base() plus the page's 'data_size'. */
+static uint8_t *
+rb_end(struct rb_cursor *cursor)
+{
+    uint8_t *first = rb_base(cursor);
+
+    return &first[cursor->page->data_size];
+}
+
+/* Returns the byte offset, within the ring buffer data, of the cursor's
+ * 'tail' position (the next event to read). */
+static uint64_t
+cursor_event_offset(struct rb_cursor *cursor)
+{
+    const uint64_t ring_size = cursor->page->data_size;
+
+    return cursor->tail % ring_size;
+}
+
+/* Returns the byte offset, within the ring buffer data, of the cursor's
+ * 'head' position (the end of the available events). */
+static uint64_t
+cursor_end_offset(struct rb_cursor *cursor)
+{
+    const uint64_t ring_size = cursor->page->data_size;
+
+    return cursor->head % ring_size;
+}
+
+/* Returns a pointer to the event at the cursor's 'tail' position, or NULL if
+ * the 'tail' and 'head' positions map to the same offset in the ring
+ * buffer. */
+static void *
+cursor_peek(struct rb_cursor *cursor)
+{
+    uint64_t event_ofs = cursor_event_offset(cursor);
+
+    if (event_ofs == cursor_end_offset(cursor)) {
+        return NULL;
+    }
+    return rb_base(cursor) + event_ofs;
+}
+
+/* Returns the address 'header->size' bytes past 'header', i.e. one byte past
+ * the event if the event were laid out contiguously. */
+static uint8_t *
+event_end(struct perf_event_header *header)
+{
+    uint8_t *start = (uint8_t *) header;
+
+    return &start[header->size];
+}
+
+/* Initializes 'cursor' with 'page' and the current values of its 'data_head'
+ * and 'data_tail'.
+ *
+ * Returns true if the two positions differ, false if they are equal. */
+static bool
+init_cursor(struct rb_cursor *cursor,
+            struct perf_event_mmap_page *page)
+{
+    volatile uint64_t *head_ptr = (volatile uint64_t *) &page->data_head;
+
+    cursor->page = page;
+    cursor->head = *head_ptr;
+    cursor->tail = page->data_tail;
+
+    /* Keep the load of 'data_head' above apart from any later loads of the
+     * ring buffer data. */
+    atomic_thread_fence(memory_order_consume);
+
+    return cursor->head != cursor->tail;
+}
+
+/* Stores 'tail' into 'page->data_tail', after a release fence. */
+static void
+perf_event_pull(struct perf_event_mmap_page *page, uint64_t tail)
+{
+    /* Every earlier read of ring buffer data has to be separated from the
+     * tail update that follows. */
+    atomic_thread_fence(memory_order_release);
+
+    page->data_tail = tail;
+}
+
+/* Copies the event at the cursor's 'tail' position into 'buffer', replacing
+ * whatever 'buffer' previously held, and advances 'cursor->tail' past the
+ * event.  An event that runs past the end of the ring buffer data is copied
+ * in two pieces, the second one taken from the start of the data.
+ *
+ * On success, sets 'buffer->header' to the start of the copied event and
+ * returns true.  Returns false, leaving 'buffer' and 'cursor' untouched, if
+ * cursor_peek() finds no event. */
+static bool
+perf_event_copy(struct rb_cursor *cursor, struct ofpbuf *buffer)
+{
+    struct perf_event_header *header = cursor_peek(cursor);
+
+    if (!header) {
+        return false;
+    }
+
+    ofpbuf_clear(buffer);
+    if (event_end(header) <= rb_end(cursor)) {
+        /* Append, as the wrapped case below does.  Prepending with
+         * ofpbuf_push() needs headroom, which a just-cleared buffer lacks, so
+         * it would force the buffer to be reallocated for every event. */
+        ofpbuf_put(buffer, header, header->size);
+    } else {
+        uint64_t seg1_len = rb_end(cursor) - (uint8_t *) header;
+        uint64_t seg2_len = header->size - seg1_len;
+
+        ofpbuf_put(buffer, header, seg1_len);
+        ofpbuf_put(buffer, rb_base(cursor), seg2_len);
+    }
+
+    buffer->header = buffer->data;
+    cursor->tail += header->size;
+
+    return true;
+}
+
+/* Reads the next perf event from 'channel' into 'buffer' and consumes it
+ * from the ring buffer.
+ *
+ * 'buffer' may be reallocated, so the caller must subsequently uninitialize
+ * it.  'buffer->header' is updated to point to the beginning of the event,
+ * which starts with a 'struct perf_event_header'.
+ *
+ * Returns:
+ *
+ *   - 0 if the event is a PERF_RECORD_SAMPLE.
+ *
+ *   - EAGAIN if there is no new event; 'channel' is left unchanged.
+ *
+ *   - ENOBUFS if the event is a PERF_RECORD_LOST; its 'lost' count is added
+ *     to the 'perf_lost' coverage counter.
+ *
+ *   - EPROTO if the event has any other type.
+ */
+int
+perf_channel_read(struct perf_channel *channel, struct ofpbuf *buffer)
+{
+    struct perf_event_header *hdr;
+    struct rb_cursor rb;
+    int retval;
+
+    if (!init_cursor(&rb, channel->page) || !perf_event_copy(&rb, buffer)) {
+        return EAGAIN;
+    }
+
+    hdr = buffer->header;
+    if (hdr->type == PERF_RECORD_SAMPLE) {
+        COVERAGE_INC(perf_sample);
+        retval = 0;
+    } else if (hdr->type == PERF_RECORD_LOST) {
+        struct perf_event_lost *lost_event = buffer->header;
+
+        COVERAGE_ADD(perf_lost, lost_event->lost);
+        retval = ENOBUFS;
+    } else {
+        COVERAGE_INC(perf_unknown);
+        retval = EPROTO;
+    }
+
+    /* The event was copied out whatever its type, so consume it. */
+    perf_event_pull(channel->page, rb.tail);
+
+    return retval;
+}
+
+/* Discards everything currently queued on 'channel' by moving its tail up to
+ * the current head, without reading any event data. */
+void
+perf_channel_flush(struct perf_channel *channel)
+{
+    struct perf_event_mmap_page *ctl = channel->page;
+    volatile uint64_t *head_ptr = (volatile uint64_t *) &ctl->data_head;
+    uint64_t latest = *head_ptr;
+
+    /* No ring buffer data is read here, so the memory_order_consume fence is
+     * unnecessary - see perf_output_put_handle().  The read of the head above
+     * still has to be ordered with respect to the tail write, which
+     * perf_event_pull() takes care of. */
+    perf_event_pull(ctl, latest);
+}
diff --git a/lib/perf-event.h b/lib/perf-event.h
new file mode 100644
index 000000000000..74bc8e961dbc
--- /dev/null
+++ b/lib/perf-event.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PERF_EVENT_H
+#define PERF_EVENT_H 1
+
+#include <linux/perf_event.h>
+#include "openvswitch/ofpbuf.h"
+#include "openvswitch/types.h"
+
+struct perf_event_raw { /* NOTE(review): presumably a PERF_SAMPLE_RAW sample; confirm. */
+ struct perf_event_header header; /* Header that starts every event. */
+ uint32_t size; /* Length in bytes of the data that follows. */
+ /* Followed by uint8_t data[size]; */
+};
+
+struct perf_channel { /* State of one channel set up by perf_channel_open(). */
+ struct perf_event_mmap_page *page; /* Start of the mmap()ed region of 'fd'. */
+ int cpu; /* CPU number given to perf_channel_open(). */
+ int fd; /* Perf event file descriptor. */
+ size_t length; /* Length in bytes of the mapping at 'page'. */
+};
+
+/* Channel setup and teardown. */
+int perf_channel_open(struct perf_channel *channel, int cpu, size_t page_len);
+void perf_channel_close(struct perf_channel *channel);
+
+/* Channel control and event retrieval. */
+int perf_channel_set(struct perf_channel *channel, bool enable);
+int perf_channel_read(struct perf_channel *channel, struct ofpbuf *buffer);
+void perf_channel_flush(struct perf_channel *channel);
+
+#endif /* PERF_EVENT_H */
--
2.7.4