// Software APIs — memory_perftest.c
1 // Copyright lowRISC contributors (OpenTitan project).
2 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
3 // SPDX-License-Identifier: Apache-2.0
4 
5 #include <stdbool.h>
6 #include <stddef.h>
7 #include <stdint.h>
8 
13 #include "sw/device/lib/testing/test_framework/check.h"
15 
enum {
  // Size in bytes of each scratch buffer handed to the functions under test.
  kBufLen = 1000,
  // Number of timed iterations accumulated per perf test.
  kNumRuns = 10,
};
20 
// Describes one performance test: how to prime the two scratch buffers and
// which wrapper around the function under test to time.
typedef struct perf_test {
  // A human-readable name for this particular test, e.g. "memcpy".
  const char *label;

  // Setup functions for each context buffer. These function pointers must not
  // be NULL. The runtime of these functions will not be measured.
  void (*setup_buf1)(uint8_t *, size_t);
  void (*setup_buf2)(uint8_t *, size_t);

  // A function that exercises the function under test, e.g. memcpy. This
  // function pointer must not be NULL. The runtime of this function will be
  // measured. `len` is the size in bytes of both buffers. (The parameter was
  // previously named `num_runs`, which was misleading: callers pass the
  // buffer length, not an iteration count.)
  void (*func)(uint8_t *buf1, uint8_t *buf2, size_t len);

  // The expected number of CPU cycles that `func` will take to run.
  size_t expected_max_num_cycles;
} perf_test_t;
38 
39 // Run the given `perf_test_t` and return the number of cycles it took.
40 static inline uint64_t perf_test_run(const perf_test_t *test, uint8_t *buf1,
41  uint8_t *buf2, size_t num_runs) {
42  CHECK(test->setup_buf1 != NULL);
43  CHECK(test->setup_buf2 != NULL);
44  CHECK(test->func != NULL);
45 
46  uint64_t total_clock_cycles = 0;
47  for (size_t i = 0; i < num_runs; ++i) {
48  test->setup_buf1(buf1, kBufLen);
49  test->setup_buf2(buf2, kBufLen);
50 
51  uint64_t start_cycles = ibex_mcycle_read();
52  test->func(buf1, buf2, kBufLen);
53  uint64_t end_cycles = ibex_mcycle_read();
54 
55  // Even if the 64-bit cycle counter overflowed while running the test, the
56  // following subtraction would still be well-defined and correct, provided
57  // that the test ran in fewer than 2**64 cycles. Practically, we should
58  // never see it overflow. Even if it took 12 hours to run all the perf
59  // tests, the clock would have to run at 427 THz to overflow the counter
60  // (2**64 cycles / (12 * 60 * 60) seconds).
61  const uint64_t num_cycles = end_cycles - start_cycles;
62 
63  CHECK(total_clock_cycles < UINT64_MAX - num_cycles);
64  total_clock_cycles += num_cycles;
65  }
66 
67  return total_clock_cycles;
68 }
69 
70 // Fill the buffer with arbitrary, but deterministically-selected bytes.
// Fill the buffer with arbitrary, but deterministically-selected bytes.
static inline void fill_buf_deterministic_values(uint8_t *buf, size_t len) {
  // Tiny multiply-and-add generator with a fixed seed: every call produces
  // the exact same byte sequence, so timed runs see identical data.
  uint32_t mix = 42;
  size_t idx = 0;
  while (idx < len) {
    mix = mix * 17 + idx;
    buf[idx] = (uint8_t)mix;
    ++idx;
  }
}
78 
79 // Zero out the buffer.
80 static inline void fill_buf_zeroes(uint8_t *buf, size_t len) {
81  memset(buf, 0, len);
82 }
83 
84 // Zero out the buffer, but put a one at the end.
85 static inline void fill_buf_zeroes_then_one(uint8_t *buf, size_t len) {
86  fill_buf_zeroes(buf, len);
87  buf[len - 1] = 1;
88 }
89 
90 // Zero out the buffer, but put a one at the beginning.
91 static inline void fill_buf_one_then_zeroes(uint8_t *buf, size_t len) {
92  fill_buf_zeroes(buf, len);
93  buf[0] = 1;
94 }
95 
// Measured body for memcpy: copies `buf2` into `buf1`. Marked OT_NOINLINE so
// the call stays out-of-line and the timed region is the real function call.
OT_NOINLINE void test_memcpy(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memcpy(buf1, buf2, len);
}
99 
// Measured body for memset. The fill value is read from `buf2[0]` at runtime,
// presumably so the compiler cannot constant-fold the fill — TODO confirm.
OT_NOINLINE void test_memset(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const int value = buf2[0];
  memset(buf1, value, len);
}
104 
// Measured body for memcmp. The comparison result is intentionally discarded;
// only the runtime of the call matters.
OT_NOINLINE void test_memcmp(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memcmp(buf1, buf2, len);
}
108 
// Measured body for memrcmp (OpenTitan helper; presumably compares starting
// from the end of the buffers — confirm in base/memory.h). Result discarded.
OT_NOINLINE void test_memrcmp(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memrcmp(buf1, buf2, len);
}
112 
// Measured body for memchr. Searches for the byte stored at the end of
// `buf1`, which likely forces a scan over most of the buffer (an equal byte
// could occur earlier). `buf2` is unused; kept so all tests share one
// signature.
OT_NOINLINE void test_memchr(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const uint8_t value = buf1[len - 1];
  memchr(buf1, value, len);
}
117 
// Measured body for memrchr (OpenTitan reverse search helper). Searches for
// the byte stored at the start of `buf1`, which likely forces a scan over
// most of the buffer when searching from the end. `buf2` is unused; kept so
// all tests share one signature.
OT_NOINLINE void test_memrchr(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const uint8_t value = buf1[0];
  memrchr(buf1, value, len);
}
122 
OTTF_DEFINE_TEST_CONFIG();

// Each value of `expected_max_num_cycles` was determined experimentally by
// testing on a CW310 FPGA with the following command:
//
// $ ./bazelisk.sh test --copt -O2 --test_output=all \
//     //sw/device/lib/base:memory_perftest_fpga_cw310
//
// There are a handful of reasons why the expected number of cycles for this
// test might be inaccurate. Here are a few of them:
//
// (1) You've changed the definition of the test, e.g. changing the size of
//     the test buffers.
// (2) You've changed the implementation of memset, memcpy, etc. and they can
//     do the same job in fewer cycles.
// (3) The test was not compiled with `-O2`. The hardcoded cycle count
//     expectations assume `-O2`.
// (4) The compiler has gotten smarter.
// (5) The icache gets turned on prior to test execution.
//
// If you observe the cycle count is smaller than the hardcoded expectation,
// that's probably a good thing; consider updating the expectation!
static const perf_test_t kPerfTests[] = {
    {
        .label = "memcpy",
        .setup_buf1 = &fill_buf_deterministic_values,
        .setup_buf2 = &fill_buf_deterministic_values,
        .func = &test_memcpy,
        .expected_max_num_cycles = 33270,
    },
    {
        .label = "memcpy_zeroes",
        .setup_buf1 = &fill_buf_deterministic_values,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memcpy,
        .expected_max_num_cycles = 33270,
    },
    {
        .label = "memset",
        .setup_buf1 = &fill_buf_zeroes,
        .setup_buf2 = &fill_buf_deterministic_values,
        .func = &test_memset,
        .expected_max_num_cycles = 23200,
    },
    {
        .label = "memset_zeroes",
        .setup_buf1 = &fill_buf_zeroes,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memset,
        .expected_max_num_cycles = 23200,
    },
    {
        // Buffers differ only in the final byte, so memcmp must scan the
        // whole length before finding the mismatch.
        .label = "memcmp_pathological",
        .setup_buf1 = &fill_buf_zeroes_then_one,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memcmp,
        .expected_max_num_cycles = 110740,
    },
    {
        .label = "memcmp_zeroes",
        .setup_buf1 = &fill_buf_zeroes,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memcmp,
        .expected_max_num_cycles = 110740,
    },
    {
        // Buffers differ only in byte 0; assuming memrcmp compares from the
        // end, this forces a full backward scan — confirm in base/memory.h.
        .label = "memrcmp_pathological",
        .setup_buf1 = &fill_buf_zeroes,
        .setup_buf2 = &fill_buf_one_then_zeroes,
        .func = &test_memrcmp,
        .expected_max_num_cycles = 50740,
    },
    {
        .label = "memrcmp_zeroes",
        .setup_buf1 = &fill_buf_zeroes,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memrcmp,
        .expected_max_num_cycles = 50850,
    },
    {
        // test_memchr searches for buf1's last byte; with a deterministic
        // fill this likely scans most of the buffer.
        .label = "memchr_pathological",
        .setup_buf1 = &fill_buf_deterministic_values,
        .setup_buf2 = &fill_buf_zeroes,
        .func = &test_memchr,
        .expected_max_num_cycles = 7250,
    },
    {
        // test_memrchr searches for buf1's first byte; searching from the
        // end likely scans most of the buffer.
        .label = "memrchr_pathological",
        .setup_buf1 = &fill_buf_deterministic_values,
        .setup_buf2 = &fill_buf_deterministic_values,
        .func = &test_memrchr,
        .expected_max_num_cycles = 23850,
    },
};
217 
// Scratch buffers shared by all perf tests; re-initialized by each test's
// setup functions before every timed run.
static uint8_t buf1[kBufLen];
static uint8_t buf2[kBufLen];
220 
221 bool test_main(void) {
222  bool all_expectations_match = true;
223  for (size_t i = 0; i < ARRAYSIZE(kPerfTests); ++i) {
224  const perf_test_t *test = &kPerfTests[i];
225 
226  const uint64_t num_cycles = perf_test_run(test, buf1, buf2, kNumRuns);
227  if (num_cycles > test->expected_max_num_cycles) {
228  all_expectations_match = false;
229  // Cast cycle counts to `uint32_t` before printing because `base_printf()`
230  // cannot print `uint64_t`.
231  CHECK(test->expected_max_num_cycles < UINT32_MAX);
232  CHECK(num_cycles < UINT32_MAX);
233  const uint32_t expected_max_num_cycles_u32 =
234  (uint32_t)test->expected_max_num_cycles;
235  const uint32_t num_cycles_u32 = (uint32_t)num_cycles;
236 
237  CHECK(num_cycles < UINT32_MAX / 100);
238  const uint32_t percent_change =
239  (100 * num_cycles_u32) / expected_max_num_cycles_u32;
240 
241  LOG_WARNING(
242  "%s:\n"
243  " Expected: %10d cycles\n"
244  " Actual: %10d cycles\n"
245  " Actual/Expected: %10d%%\n",
246  test->label, expected_max_num_cycles_u32, num_cycles_u32,
247  percent_change);
248  }
249  }
250  return all_expectations_match;
251 }