// hardened.h — Software APIs (source listing; see the generated documentation
// for this file).
// Copyright lowRISC contributors (OpenTitan project).
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#ifndef OPENTITAN_SW_DEVICE_LIB_BASE_HARDENED_H_
#define OPENTITAN_SW_DEVICE_LIB_BASE_HARDENED_H_

#include <stdint.h>

#include "sw/device/lib/base/hardened_asm.h"
#include "sw/device/lib/base/macros.h"

/**
 * @file
 * @brief Data Types for use in Hardened Code.
 */

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

23 /**
24  * This is a boolean type for use in hardened contexts.
25  *
26  * The intention is that this is used instead of `<stdbool.h>`'s #bool, where a
27  * higher hamming distance is required between the truthy and the falsey value.
28  *
29  * The values below were chosen at random, with some specific restrictions. They
30  * have a Hamming Distance of 8, and they are 11-bit values so they can be
31  * materialized with a single instruction on RISC-V. They are also specifically
32  * not the complement of each other.
33  */
34 typedef enum hardened_bool {
35  /**
36  * The truthy value, expected to be used like #true.
37  */
38  kHardenedBoolTrue = HARDENED_BOOL_TRUE,
39  /**
40  * The falsey value, expected to be used like #false.
41  */
42  kHardenedBoolFalse = HARDENED_BOOL_FALSE,
44 
45 /**
46  * A byte-sized hardened boolean.
47  *
48  * This type is intended for cases where a byte-sized hardened boolean is
49  * required, e.g. for the entries of the `CREATOR_SW_CFG_SIGVERIFY_RSA_KEY_EN`
50  * OTP item.
51  *
52  * The values below were chosen to ensure that the hamming difference between
53  * them is greater than 5 and they are not bitwise complements of each other.
54  */
55 typedef enum hardened_byte_bool {
56  /**
57  * The truthy value.
58  */
59  kHardenedByteBoolTrue = HARDENED_BYTE_BOOL_TRUE,
60  /**
61  * The falsy value.
62  */
63  kHardenedByteBoolFalse = HARDENED_BYTE_BOOL_FALSE,
65 
/**
 * Launders the 32-bit value `val`.
 *
 * `launder32()` returns a value identical to the input, while introducing an
 * optimization barrier that prevents the compiler from learning new information
 * about the original value, based on operations on the laundered value. This
 * does not work the other way around: some information that the compiler has
 * already learned about the value may make it through; this last caveat is
 * explained below.
 *
 * In some circumstances, it is desirable to introduce a redundant (from the
 * compiler's perspective) operation into the instruction stream to make it less
 * likely that hardware faults will result in critical operations being
 * bypassed. Laundering makes it possible to insert such duplicate operations,
 * while blocking the compiler's attempts to delete them through dead code
 * elimination.
 *
 * Suppose we have a pure-C `CHECK()` macro that does a runtime-assert.
 * For example, in the following code, a compiler would fold `CHECK()` away,
 * since dataflow analysis could tell it that `x` is always zero, allowing it to
 * deduce that `x == 0` -> `0 == 0` -> `true`. It then deletes the `CHECK(true)`
 * as dead code.
 * ```
 * if (x == 0) {
 *   CHECK(x == 0);
 * }
 * ```
 *
 * If we believe that an attacker can glitch the chip to skip the first branch,
 * this assumption no longer holds. We can use laundering to prevent LLVM from
 * learning that `x` is zero inside the block:
 * ```
 * if (launder32(x) == 0) {
 *   CHECK(x == 0);
 * }
 * ```
 *
 * Note that this operation is order sensitive: while we can stop the compiler
 * from learning new information, it is very hard to make it forget in some
 * cases. If we had instead written
 * ```
 * if (x == 0) {
 *   CHECK(launder32(x) == 0);
 * }
 * ```
 * then it would be entitled to rewrite this into `launder32(0) == 0`.
 * Although it can't make the deduction that this is identically true, because
 * `launder32()` is the identity function, this behaves like `0 == 0` at
 * runtime. For example, a completely valid lowering of this code is
 * ```
 *   bnez a0, else
 *   mv a0, zero
 *   bnez a0, shutdown
 *   // ...
 * ```
 *
 * Even pulling the expression out of the branch as `uint32_t y = launder32(x);`
 * doesn't work, because the compiler may choose to move the operation into the
 * branch body. Thus, we need to stop it from learning that `x` is zero in the
 * first place.
 *
 * Note that, in spite of `HARDENED_CHECK` being written in assembly, it is
 * still vulnerable to certain inimical compiler optimizations. For example,
 * `HARDENED_CHECK_EQ(x, 0)` can be rewritten to `HARDENED_CHECK_EQ(0, 0)`.
 * Although the compiler cannot delete this code, it will emit the nonsensical
 * ```
 *   beqz zero, continue
 *   unimp
 * continue:
 * ```
 * effectively negating the doubled condition.
 *
 * ---
 *
 * In general, this intrinsic should *only* be used on values that the compiler
 * doesn't already know anything interesting about. The precise meaning of this
 * can be subtle, since inlining can easily foil pessimization. Uses of this
 * function must be carefully reviewed, and commented with justification and an
 * explanation of what information it is preventing access to, and what ambient
 * invariants it relies on.
 *
 * This function makes no guarantees: it is only intended to help harden code.
 * It does not remove the need to carefully verify that the correct
 * instruction-level invariants are upheld in the final release.
 *
 * This operation *does not* introduce a sequence point: a compiler may move it
 * anywhere between where its input is last modified or its output is first
 * moved. This can include moving through different nodes in the control flow
 * graph. The location of the laundering operation must be determined
 * *exclusively* by its position in the expression dependency graph.
 *
 * Other examples of laundering use-cases should be listed here as they become
 * apparent.
 *
 * * Obscuring loop completion. It may be useful to prevent a compiler from
 *   learning that a particular loop is guaranteed to complete. The most
 *   straightforward way to do this is to launder the loop increment: replace
 *   `++i` with `i = launder32(i) + 1`. This is helpful for preventing the
 *   compiler from learning that the loop executes in a particular order, and
 *   foiling unroll/unroll-and-jam optimizations. It also prevents the compiler
 *   from learning that the loop counter was saturated. However, if the exit
 *   condition is `i < len`, the compiler will learn that `i >= len` outside the
 *   loop.
 *
 *   Laundering just the exit comparison, `launder32(i) < len`, will still allow
 *   the compiler to deduce that the loop is traversed in linear order, but it
 *   will not learn that `i >= len` outside the loop. Laundering both loop
 *   control expressions may be necessary in some cases.
 *
 * * Assigning a literal to a value without the compiler learning about
 *   that literal value. `x = launder32(0);` zeros `x`, but the compiler
 *   cannot replace all uses of it with a constant `0`. Note that
 *   `x = launder32(x);` does not prevent knowledge the compiler already has
 *   about `x` from replacing it with a constant in `launder32()`.
 *
 * * Preventing operations from being re-associated. The compiler is entitled
 *   to rewrite `x ^ (y ^ z)` into `(x ^ y) ^ z`, but cannot do the same with
 *   `x ^ launder32(y ^ z)`. No operations commute through `launder32()`.
 *
 * * Preventing dead-code elimination. The compiler cannot delete the
 *   unreachable expression `if (launder32(false)) { foo(); }`. However, the
 *   branch will never be executed in an untampered instruction stream.
 *
 * @param val A 32-bit integer to launder.
 * @return A 32-bit integer which will *happen* to have the same value as `val`
 *         at runtime.
 */
inline uint32_t launder32(uint32_t val) {
  // NOTE: This implementation is LLVM-specific, and should be considered to be
  // a no-op in every other compiler. For example, GCC has in the past peered
  // into the insides of assembly blocks.
  //
  // LLVM cannot see through assembly blocks, and must treat this as a black
  // box. Similar tricks are employed in other security-sensitive code-bases,
  // such as https://boringssl-review.googlesource.com/c/boringssl/+/36484.
  //
  // # "To volatile or not to volatile, that is the question."
  // Naively, this snippet would not be marked volatile, since we do not care
  // about reordering. However, there are rare optimizations that can result
  // in "miscompilation" of this primitive. For example, if the argument is
  // a complex expression, we get something like the following Godbolt:
  // https://godbolt.org/z/c7M7Yr7Yo.
  //
  // This is not actually a miscompilation: LLVM is treating the asm
  // statement as behaving like a random oracle determined entirely by its
  // arguments; therefore, it is entitled to deduplicate both occurrences of
  // `LaunderPure(y)` (in this particular case, from bisecting the passes,
  // it appears to happen during register allocation).
  //
  // We work around this by marking the inline asm volatile.
  //
  // Alternatively, we could add a "nonce" to the inline asm that has been
  // tainted by a volatile asm operation. This has the benefit that the input
  // does not need to happen-before the volatile operation, and can be
  // arbitrarily reordered, while ensuring that no call of launder32() is
  // "pure" in the deduplication sense. I.e.:
  //
  //   uint32_t nonce;
  //   asm volatile("" : "=r"(nonce));
  //   asm("" : "+r"(val) : "r"(nonce));
  //
  // At the time of writing, it seems preferable to have something we know is
  // correct rather than being overly clever; this is recorded here in case
  // the current implementation is unsuitable and we need something more
  // carefully tuned.
  //
  // This comment was formerly present to justify the lack of volatile. It is
  // left behind as a warning to not try to remove it without further careful
  // analysis.
  // > It is *not* marked as volatile, since reordering is not desired; marking
  // > it volatile does make some laundering operations suddenly start working
  // > spuriously, which is entirely dependent on how excited LLVM is about
  // > reordering things. To avoid this trap, we do not mark as volatile and
  // > instead require that reordering be prevented through careful sequencing
  // > of statements.

  // When we're building for static analysis, reduce false positives by
  // short-circuiting the inline assembly block.
#if OT_BUILD_FOR_STATIC_ANALYZER || OT_DISABLE_HARDENING
  return val;
#endif

  // The +r constraint tells the compiler that this is an "inout" parameter: it
  // means that not only does the black box depend on `val`, but it also mutates
  // it in an unspecified way.
  //
  // `__asm__` (rather than plain `asm`) keeps this header usable under strict
  // ISO C modes (e.g. -std=c11), where `asm` is a GNU extension.
  __asm__ volatile("" : "+r"(val));
  return val;
}

/**
 * Launders the pointer-sized value `val`.
 *
 * See `launder32()` for detailed semantics.
 *
 * @param val A pointer-sized integer to launder.
 * @return A pointer-sized integer which will happen to have the same value as
 *         `val` at runtime.
 */
inline uintptr_t launderw(uintptr_t val) {
  // When we're building for static analysis, reduce false positives by
  // short-circuiting the inline assembly block.
#if OT_BUILD_FOR_STATIC_ANALYZER || OT_DISABLE_HARDENING
  return val;
#endif
  // See `launder32()` for the constraint and volatile rationale.
  __asm__ volatile("" : "+r"(val));
  return val;
}

/**
 * Creates a reordering barrier for `val`.
 *
 * `barrier32()` takes an argument and discards it, while introducing an
 * "impure" dependency on that value. This forces the compiler to schedule
 * instructions such that the intermediate `val` actually appears in a
 * register. Because it is impure, `barrier32()` operations will not be
 * reordered past each other or MMIO operations, although they can be reordered
 * past functions if LTO inlines them.
 *
 * Most compilers will try to reorder arithmetic operations in such a way
 * that they form large basic blocks, since modern microarchitectures can
 * take advantage of instruction-level parallelism. Unfortunately, this means
 * that instructions that we want to interleave with other instructions are
 * likely to get separated; this includes static interleavings,
 * loop-invariant code motion, and some kinds of unroll-and-jam.
 *
 * For example, given
 *
 * ```
 * uint32_t product = 5;
 *
 * foo();
 * product *= product;
 * foo();
 * product *= product;
 * foo();
 * product *= product;
 * ```
 *
 * A compiler will likely reorder this as
 *
 * ```
 * uint32_t product = 5;
 *
 * foo();
 * foo();
 * foo();
 * product *= product;
 * product *= product;
 * product *= product;
 * ```
 *
 * The compiler is further entitled to constant-propagate `product`.
 * `barrier32()` can be used to avoid this:
 *
 * ```
 * // NB: the initial value of `product` is laundered to prevent
 * // constant propagation, but only because it is a compile-time
 * // constant. Laundering the intermediates would also work.
 * uint32_t product = launder32(5);
 * barrier32(product);
 *
 * barrier32(foo());
 * product *= product;
 * barrier32(product);
 *
 * barrier32(foo());
 * product *= product;
 * barrier32(product);
 *
 * barrier32(foo());
 * product *= product;
 * barrier32(product);
 * ```
 *
 * Note that we must place barriers on the result of the function calls,
 * too, so that the compiler believes that there is a potential dependency
 * between the return value of the function, and the followup value of
 * `product`.
 *
 * This is also useful for avoiding loop reordering:
 *
 * ```
 * for (int i = 0; i != n - 1; i = (i + kPrime) % n) {
 *   barrier32(i);
 *
 *   // Stuff.
 * }
 * ```
 *
 * A sufficiently intelligent compiler might notice that it can linearize this
 * loop; however, the barriers in each loop iteration force a particular order
 * to be observed.
 *
 * @param val A value to create a barrier for.
 */
inline void barrier32(uint32_t val) { __asm__ volatile("" ::"r"(val)); }

/**
 * Creates a reordering barrier for `val`.
 *
 * See `barrier32()` for detailed semantics.
 *
 * @param val A value to create a barrier for.
 */
inline void barrierw(uintptr_t val) { __asm__ volatile("" ::"r"(val)); }

/**
 * A constant-time, 32-bit boolean value.
 *
 * Values of this type MUST be either all zero bits or all one bits,
 * representing `false` and `true` respectively.
 *
 * Although it is possible to convert an existing `bool` into a `ct_bool32_t` by
 * writing `-((ct_bool32_t) my_bool)`, we recommend against it: prefer producing
 * these values directly with the constant-time comparison functions below.
 */
typedef uint32_t ct_bool32_t;

// The formulae below are taken from Hacker's Delight, Chapter 2.
// Although the book does not define them as being constant-time, they are
// branchless; branchless code is always constant-time.
//
// Proofs and references to HD are provided only in the 32-bit versions.
//
// Warren Jr., Henry S. (2013). Hacker's Delight (2 ed.).
// Addison Wesley - Pearson Education, Inc. ISBN 978-0-321-84268-8.

392 /**
393  * Performs constant-time signed comparison to zero.
394  *
395  * Returns whether `a < 0`, as a constant-time boolean.
396  * In other words, this checks if `a` is negative, i.e., it's sign bit is set.
397  *
398  * @return `a < 0`.
399  */
401 inline ct_bool32_t ct_sltz32(int32_t a) {
402  // Proof. `a` is negative iff its MSB is set;
403  // arithmetic-right-shifting by bits(a)-1 smears the sign bit across all
404  // of `a`.
405  return OT_UNSIGNED(a >> (sizeof(a) * 8 - 1));
406 }
407 
408 /**
409  * Performs constant-time unsigned ascending comparison.
410  *
411  * Returns `a < b` as a constant-time boolean.
412  *
413  * @return `a < b`.
414  */
416 inline ct_bool32_t ct_sltu32(uint32_t a, uint32_t b) {
417  // Proof. See Hacker's Delight page 23.
418  return ct_sltz32(OT_SIGNED(((a & ~b) | ((a ^ ~b) & (a - b)))));
419 }
420 
421 /**
422  * Performs constant-time zero equality.
423  *
424  * Returns `a == 0` as a constant-time boolean.
425  *
426  * @return `a == 0`.
427  */
429 inline ct_bool32_t ct_seqz32(uint32_t a) {
430  // Proof. See Hacker's Delight page 23.
431  // HD gives this formula: `a == b := ~(a-b | b-a)`.
432  //
433  // Setting `b` to zero gives us
434  // ~(a | -a) -> ~a & ~-a -> ~a & (a - 1)
435  // via identities on page 16.
436  //
437  // This forumula is also given on page 11 for a different purpose.
438  return ct_sltz32(OT_SIGNED(~a & (a - 1)));
439 }
440 
441 /**
442  * Performs constant-time equality.
443  *
444  * Returns `a == b` as a constant-time boolean.
445  *
446  * @return `a == b`.
447  */
449 inline ct_bool32_t ct_seq32(uint32_t a, uint32_t b) {
450  // Proof. a ^ b == 0 -> a ^ a ^ b == a ^ 0 -> b == a.
451  return ct_seqz32(a ^ b);
452 }
453 
454 /**
455  * Performs a constant-time select.
456  *
457  * Returns `a` if `c` is true; otherwise, returns `b`.
458  *
459  * This function should be used with one of the comparison functions above; do
460  * NOT create `c` using an `if` or `?:` operation.
461  *
462  * @param c The condition to test.
463  * @param a The value to return on true.
464  * @param b The value to return on false.
465  * @return `c ? a : b`.
466  */
468 inline uint32_t ct_cmov32(ct_bool32_t c, uint32_t a, uint32_t b) {
469  // Proof. See Hacker's Delight page 46. HD gives this as a branchless swap;
470  // branchless select is a special case of that.
471 
472  // `c` must be laundered because LLVM has a strength reduction pass for this
473  // exact pattern, but lacking a cmov instruction, it will almost certainly
474  // select a branch instruction here.
475  return (launder32(c) & a) | (launder32(~c) & b);
476 }
477 
/**
 * A constant-time, pointer-sized boolean value.
 *
 * Values of this type MUST be either all zero bits or all one bits.
 */
typedef uintptr_t ct_boolw_t;

485 /**
486  * Performs constant-time signed comparison to zero.
487  *
488  * Returns whether `a < 0`, as a constant-time boolean.
489  * In other words, this checks if `a` is negative, i.e., it's sign bit is set.
490  *
491  * @return `a < 0`.
492  */
494 inline ct_boolw_t ct_sltzw(intptr_t a) {
495  return OT_UNSIGNED(a >> (sizeof(a) * 8 - 1));
496 }
497 
498 /**
499  * Performs constant-time unsigned ascending comparison.
500  *
501  * Returns `a < b` as a constant-time boolean.
502  *
503  * @return `a < b`.
504  */
506 inline ct_boolw_t ct_sltuw(uintptr_t a, uintptr_t b) {
507  return ct_sltzw(OT_SIGNED((a & ~b) | ((a ^ ~b) & (a - b))));
508 }
509 
510 /**
511  * Performs constant-time zero equality.
512  *
513  * Returns `a == 0` as a constant-time boolean.
514  *
515  * @return `a == 0`.
516  */
518 inline ct_boolw_t ct_seqzw(uintptr_t a) {
519  return ct_sltzw(OT_SIGNED(~a & (a - 1)));
520 }
521 
522 /**
523  * Performs constant-time equality.
524  *
525  * Returns `a == b` as a constant-time boolean.
526  *
527  * @return `a == b`.
528  */
530 inline ct_boolw_t ct_seqw(uintptr_t a, uintptr_t b) { return ct_seqzw(a ^ b); }
531 
532 /**
533  * Performs a constant-time select.
534  *
535  * Returns `a` if `c` is true; otherwise, returns `b`.
536  *
537  * This function should be used with one of the comparison functions above; do
538  * NOT create `c` using an `if` or `?:` operation.
539  *
540  * @param c The condition to test.
541  * @param a The value to return on true.
542  * @param b The value to return on false.
543  * @return `c ? a : b`.
544  */
546 inline uintptr_t ct_cmovw(ct_boolw_t c, uintptr_t a, uintptr_t b) {
547  return (launderw(c) & a) | (launderw(~c) & b);
548 }
549 
// Implementation details shared across shutdown macros.
#ifdef OT_PLATFORM_RV32
// This string can be tuned to be longer or shorter as desired, for
// fault-hardening purposes.
#define HARDENED_UNIMP_SEQUENCE_() "unimp; unimp; unimp;"

// RV32 branch mnemonics; each comparison macro below pairs one of these with
// the shared `HARDENED_CHECK_` assembly skeleton.
#define HARDENED_CHECK_OP_EQ_ "beq"
#define HARDENED_CHECK_OP_NE_ "bne"
#define HARDENED_CHECK_OP_LT_ "bltu"
#define HARDENED_CHECK_OP_GT_ "bgtu"
#define HARDENED_CHECK_OP_LE_ "bleu"
#define HARDENED_CHECK_OP_GE_ "bgeu"

#ifndef OT_DISABLE_HARDENING
// Branch past the `unimp` sequence when the comparison holds; fall into the
// trap otherwise. `%=` generates a unique label per expansion.
// clang-format off
#define HARDENED_CHECK_(op_, a_, b_)     \
  asm volatile(                          \
      op_ " %0, %1, .L_HARDENED_%=;"     \
      HARDENED_UNIMP_SEQUENCE_()         \
      ".L_HARDENED_%=:;"                 \
      ::"r"(a_), "r"(b_))
// clang-format on

#define HARDENED_TRAP_()                       \
  do {                                         \
    asm volatile(HARDENED_UNIMP_SEQUENCE_());  \
  } while (false)

#else  // OT_DISABLE_HARDENING
// We allow disabling hardening to measure the impact of the hardened sequences
// on code size. The arguments are still evaluated-and-discarded so that
// callers compile identically either way.
#define HARDENED_CHECK_(op_, a_, b_) \
  do {                               \
    (void)(a_);                      \
    (void)(b_);                      \
  } while (0)
#define HARDENED_TRAP_() \
  do {                   \
  } while (0)
#endif  // OT_DISABLE_HARDENING
#else   // OT_PLATFORM_RV32
#include <assert.h>

// Off-target (host) builds degrade the hardened checks to plain C asserts.
#define HARDENED_CHECK_OP_EQ_ ==
#define HARDENED_CHECK_OP_NE_ !=
#define HARDENED_CHECK_OP_LT_ <
#define HARDENED_CHECK_OP_GT_ >
#define HARDENED_CHECK_OP_LE_ <=
#define HARDENED_CHECK_OP_GE_ >=

#define HARDENED_CHECK_(op_, a_, b_) assert((uint64_t)(a_)op_(uint64_t)(b_))

#define HARDENED_TRAP_() __builtin_trap()
#endif  // OT_PLATFORM_RV32

/**
 * If the following code is (unexpectedly) reached a trap instruction will be
 * executed.
 */
#define HARDENED_TRAP() HARDENED_TRAP_()

/**
 * Compare two values in a way that is *manifestly* true: that is, under normal
 * program execution, the comparison is a tautology.
 *
 * If the comparison fails, we can deduce the device is under attack, so an
 * illegal instruction will be executed. The manner in which the comparison is
 * done is carefully constructed in assembly to minimize the possibility of
 * adversarial faults. This macro also implicitly launders both arguments since
 * the compiler doesn't see the comparison and can't learn any new information
 * from it.
 *
 * There are six variants of this macro: one for each of the six comparison
 * operations.
 *
 * This macro is intended to be used along with `launder32()` to handle detected
 * faults when implementing redundant checks, i.e. in places where the compiler
 * could otherwise prove statically unreachable. For example:
 * ```
 * if (launder32(x) == 0) {
 *   HARDENED_CHECK_EQ(x, 0);
 * }
 * ```
 * See `launder32()` for more details.
 */
#define HARDENED_CHECK_EQ(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_EQ_, a_, b_)
#define HARDENED_CHECK_NE(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_NE_, a_, b_)
#define HARDENED_CHECK_LT(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_LT_, a_, b_)
#define HARDENED_CHECK_GT(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_GT_, a_, b_)
#define HARDENED_CHECK_LE(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_LE_, a_, b_)
#define HARDENED_CHECK_GE(a_, b_) HARDENED_CHECK_(HARDENED_CHECK_OP_GE_, a_, b_)

#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus

#endif  // OPENTITAN_SW_DEVICE_LIB_BASE_HARDENED_H_