// Software APIs
// ghash.c
1 // Copyright lowRISC contributors (OpenTitan project).
2 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
3 // SPDX-License-Identifier: Apache-2.0
4 
5 #include "sw/device/lib/crypto/impl/aes_gcm/ghash.h"
6 
10 // Module ID for status codes.
11 #define MODULE_ID MAKE_MODULE_ID('g', 'h', 'a')
12 
enum {
  /**
   * Log2 of the number of bytes in an AES block.
   */
  kGhashBlockLog2NumBytes = 4,
  /**
   * Number of windows for Galois field pre-computed tables.
   *
   * We use 4-bit windows, so the number of windows is 2x the number of bytes
   * in a block.
   */
  kNumWindows = kGhashBlockNumBytes << 1,
};
static_assert(kGhashBlockNumBytes == (1 << kGhashBlockLog2NumBytes),
              "kGhashBlockLog2NumBytes does not match kGhashBlockNumBytes");
28 
29 /**
30  * Precomputed modular reduction constants for Galois field multiplication.
31  *
32  * This implementation uses 4-bit windows. The bytes here represent 12-bit
33  * little-endian values.
34  *
35  * The entry with index i in this table is equal to i * 0xe1, where the bytes i
36  * and 0xe1 are interpreted as polynomials in the GCM Galois field. For
37  * example, 0xe1 = 0b1110_0001 is the degree-8 polynomial x^8 + x^2 + x + 1.
38  *
39  * The field modulus for GCM is 2^128 + x^8 + x^2 + x + 1. Therefore, if a
40  * field element is shifted, it can be reduced by multiplying the high bits
41  * (128 and above) by 0xe1 and adding them to the low bits.
42  *
43  * There is a size/speed tradeoff in window size. For 8-bit windows, GHASH
44  * becomes significantly faster, but the overhead for computing the product
45  * table of a given hash subkey becomes higher, so larger windows are slower
46  * for smaller inputs but faster for large inputs.
47  */
// Indexed by a 4-bit window value (16 entries); see the derivation of the
// constants in the block comment above.
static const uint16_t kGFReduceTable[16] = {
    0x0000, 0x201c, 0x4038, 0x6024, 0x8070, 0xa06c, 0xc048, 0xe054,
    0x00e1, 0x20fd, 0x40d9, 0x60c5, 0x8091, 0xa08d, 0xc0a9, 0xe0b5};
51 
52 /**
53  * Performs a bitwise XOR of two blocks.
54  *
55  * This operation corresponds to addition in the Galois field.
56  *
57  * @param x First operand block
58  * @param y Second operand block
59  * @param[out] out Buffer in which to store output; can be the same as one or
60  * both operands.
61  */
62 static inline void block_xor(const ghash_block_t *x, const ghash_block_t *y,
63  ghash_block_t *out) {
64  for (size_t i = 0; i < kGhashBlockNumWords; ++i) {
65  out->data[i] = x->data[i] ^ y->data[i];
66  }
67 }
68 
69 /**
70  * Logical right shift of an AES block.
71  *
72  * NIST represents AES blocks as big-endian, and a "right shift" in that
73  * representation means shifting out the LSB. However, the processor is
74  * little-endian, so we need to reverse the bytes before shifting.
75  *
76  * @param block Input AES block, modified in-place
77  * @param nbits Number of bits to shift
78  */
79 static inline void block_shiftr(ghash_block_t *block, size_t nbits) {
80  // To perform a "right shift" with respect to the big-endian block
81  // representation, we traverse the words of the block and, for each word,
82  // reverse the bytes, save the new LSB as `carry`, shift right, set the MSB
83  // to the last block's carry, and then reverse the bytes back.
84  uint32_t carry = 0;
85  for (size_t i = 0; i < kGhashBlockNumWords; ++i) {
86  uint32_t rev = __builtin_bswap32(block->data[i]);
87  uint32_t next_carry = rev & ((1 << nbits) - 1);
88  rev >>= nbits;
89  rev |= carry << (32 - nbits);
90  carry = next_carry;
91  block->data[i] = __builtin_bswap32(rev);
92  }
93 }
94 
95 /**
96  * Retrieve the byte at a given index from the block.
97  *
98  * @param block Input cipher block
99  * @param index Index of byte to retrieve (must be < `kGhashBlockNumBytes`)
100  * @return Value of block[index]
101  */
102 static inline uint8_t block_byte_get(const ghash_block_t *block, size_t index) {
103  return ((char *)block->data)[index];
104 }
105 
106 /**
107  * Multiply an element of the GCM Galois field by the polynomial `x`.
108  *
109  * This corresponds to a shift right in the bit representation, and then
110  * reduction with the field modulus.
111  *
112  * Runs in constant time.
113  *
114  * @param p Polynomial to be multiplied
115  * @param[out] out Buffer for output
116  */
117 static inline void galois_mulx(const ghash_block_t *p, ghash_block_t *out) {
118  // Get the very rightmost bit of the input block (coefficient of x^127).
119  uint8_t p127 = block_byte_get(p, kGhashBlockNumBytes - 1) & 1;
120  // Set the output to p >> 1.
121  memcpy(out->data, p->data, kGhashBlockNumBytes);
122  block_shiftr(out, 1);
123  // If the highest coefficient was 1, then subtract the polynomial that
124  // corresponds to (modulus - 2^128).
125  uint32_t mask = 0 - p127;
126  out->data[0] ^= (0xe1 & mask);
127 }
128 
/**
 * Reverse the bits of a 4-bit number.
 *
 * Only the low 4 bits of the input are consumed; the upper 4 bits are
 * discarded, so the upper 4 bits of the result are always zero. (The old
 * comment claimed the upper bits were "unmodified", which was incorrect.)
 *
 * @param byte Input byte; only the lower 4 bits are used.
 * @return The lower 4 bits of `byte`, bit-reversed; upper 4 bits are 0.
 */
static uint8_t reverse_bits(uint8_t byte) {
  /* TODO: replace with rev.n (from 0.93 draft of bitmanip) once bitmanip
   * extension is enabled. */
  uint8_t out = 0;
  for (size_t i = 0; i < 4; ++i) {
    out <<= 1;
    out |= (byte >> i) & 1;
  }
  return out;
}
145 
/**
 * Precompute the product table for the hash subkey H.
 *
 * Fills `ctx->tbl[reverse_bits(i)]` with the Galois-field product i * H for
 * every 4-bit value i, enabling the 4-bit-windowed multiplication used by
 * `galois_mul_state_key`.
 *
 * @param hash_subkey The hash subkey H (one 16-byte block).
 * @param[out] ctx GHASH context whose product table is initialized.
 */
void ghash_init_subkey(const uint32_t *hash_subkey, ghash_context_t *ctx) {
  // Initialize 0 * H = 0. (reverse_bits(0) == 0, so index 0 holds 0 * H.)
  memset(ctx->tbl[0].data, 0, kGhashBlockNumBytes);
  // Initialize 1 * H = H. Table entries are indexed by the bit-reversed 4-bit
  // multiplier, and reverse_bits(1) == 0x8.
  memcpy(ctx->tbl[0x8].data, hash_subkey, kGhashBlockNumBytes);

  // To get remaining entries, we use a variant of "shift and add"; in
  // polynomial terms, a shift is a multiplication by x. Note that, because the
  // processor represents bytes with the MSB on the left and NIST uses a fully
  // little-endian polynomial representation with the MSB on the right, we have
  // to reverse the bits of the indices.
  for (uint8_t i = 2; i < 16; i += 2) {
    // Find the product corresponding to (i >> 1) * H and multiply by x to
    // shift 1; this will be i * H.
    galois_mulx(&ctx->tbl[reverse_bits(i >> 1)], &ctx->tbl[reverse_bits(i)]);

    // Add H to i * H to get (i + 1) * H.
    block_xor(&ctx->tbl[reverse_bits(i)], &ctx->tbl[0x8],
              &ctx->tbl[reverse_bits(i + 1)]);
  }
}
167 
168 void ghash_init(ghash_context_t *ctx) {
169  memset(ctx->state.data, 0, kGhashBlockNumBytes);
170 }
171 
/**
 * Multiply the GHASH state by the hash subkey.
 *
 * See NIST SP800-38D, section 6.3.
 *
 * The NIST documentation shows a very slow double-and-add algorithm, which
 * iterates through the first operand bit-by-bit. However, since the second
 * operand is always the hash subkey H, we can speed things up significantly
 * with precomputed tables.
 *
 * This operation corresponds to multiplication in the Galois field with order
 * 2^128, modulo the polynomial x^128 + x^8 + x^2 + x + 1.
 *
 * @param ctx GHASH context, updated in place.
 */
static void galois_mul_state_key(ghash_context_t *ctx) {
  // Initialize the multiplication result to 0.
  ghash_block_t result;
  memset(result.data, 0, kGhashBlockNumBytes);

  // To compute the product, we iterate through the 4-bit windows of the input
  // block, considering the most significant (in polynomial terms) first. For
  // each window w, we:
  //   * multiply `result` by x^4 (shift all coefficients to the right)
  //   * reduce the shifted `result` modulo the field modulus
  //   * look up the product `w * H` and add it to `result`
  //
  // We can skip the shift and reduce steps on the first iteration, since
  // `result` is 0.
  for (size_t i = 0; i < kNumWindows; ++i) {
    if (i != 0) {
      // Save the most significant half-byte of `result` before shifting.
      uint8_t overflow =
          block_byte_get(&result, kGhashBlockNumBytes - 1) & 0x0f;
      // Shift `result` to the right, discarding high bits.
      block_shiftr(&result, 4);
      // Look up the product of `overflow` and the low terms of the modulus in
      // the precomputed table.
      uint16_t reduce_term = kGFReduceTable[overflow];
      // Add (xor) this product to the low bits to complete modular reduction.
      // This works because (low + x^128 * high) is equivalent to (low + (x^128
      // - modulus) * high).
      // NOTE(review): XOR-ing the 16-bit term into the 32-bit word 0 lands it
      // in the block's first two bytes only on a little-endian target — this
      // matches the byte-reversal convention in `block_shiftr`, but confirm
      // before porting to a big-endian platform.
      result.data[0] ^= reduce_term;
    }

    // Add the product of the next window and H to `result`. We process the
    // windows starting with the most significant polynomial terms, which means
    // starting from the last byte and proceeding to the first.
    uint8_t tbl_index = block_byte_get(&ctx->state, (kNumWindows - 1 - i) >> 1);

    // Select the less significant 4 bits if i is even, or the more significant
    // 4 bits if i is odd. This does not need to be constant time, since the
    // values of i in this loop are constant.
    if ((i & 1) == 1) {
      tbl_index >>= 4;
    } else {
      tbl_index &= 0x0f;
    }
    block_xor(&result, &ctx->tbl[tbl_index], &result);
  }

  // Commit the product as the new state.
  memcpy(ctx->state.data, result.data, kGhashBlockNumBytes);
}
235 
236 /**
237  * Single-block update function for GHASH.
238  *
239  * @param ctx GHASH context.
240  * @param block Block to incorporate.
241  */
242 static void ghash_process_block(ghash_context_t *ctx, ghash_block_t *block) {
243  // XOR `state` with the next input block.
244  block_xor(&ctx->state, block, &ctx->state);
245  // Multiply state by H in-place.
246  galois_mul_state_key(ctx);
247 }
248 
249 void ghash_process_full_blocks(ghash_context_t *ctx, size_t partial_len,
250  ghash_block_t *partial, size_t input_len,
251  const uint8_t *input) {
252  if (input_len < kGhashBlockNumBytes - partial_len) {
253  // Not enough data for a full block; copy into the partial block.
254  unsigned char *partial_bytes = (unsigned char *)partial->data;
255  memcpy(partial_bytes + partial_len, input, input_len);
256  } else {
257  // Construct a block from the partial data and the start of the new data.
258  unsigned char *partial_bytes = (unsigned char *)partial->data;
259  memcpy(partial_bytes + partial_len, input,
260  kGhashBlockNumBytes - partial_len);
261  input += kGhashBlockNumBytes - partial_len;
262  input_len -= kGhashBlockNumBytes - partial_len;
263 
264  // Process the block.
265  ghash_process_block(ctx, partial);
266 
267  // Process any remaining full blocks of input.
268  while (input_len >= kGhashBlockNumBytes) {
269  memcpy(partial->data, input, kGhashBlockNumBytes);
270  ghash_process_block(ctx, partial);
271  input += kGhashBlockNumBytes;
272  input_len -= kGhashBlockNumBytes;
273  }
274 
275  // Copy any remaining input into the partial block.
276  memcpy(partial->data, input, input_len);
277  }
278 }
279 
280 void ghash_update(ghash_context_t *ctx, size_t input_len,
281  const uint8_t *input) {
282  // Process all full blocks and write the remaining non-full data into
283  // `partial`.
284  ghash_block_t partial = {.data = {0}};
285  ghash_process_full_blocks(ctx, 0, &partial, input_len, input);
286 
287  // Check if there is data remaining, and process it if so.
288  size_t partial_len = input_len % kGhashBlockNumBytes;
289  if (partial_len != 0) {
290  unsigned char *partial_bytes = (unsigned char *)partial.data;
291  memset(partial_bytes + partial_len, 0, kGhashBlockNumBytes - partial_len);
292  ghash_process_block(ctx, &partial);
293  }
294 }
295 
296 void ghash_final(ghash_context_t *ctx, uint32_t *result) {
297  memcpy(result, ctx->state.data, kGhashBlockNumBytes);
298 }