1 // SPDX-License-Identifier: BSD-3-Clause 2 /* ========================================================================== 3 * ieee754.c -- floating-point conversion between half, double & single-precision 4 * 5 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved. 6 * Copyright (c) 2021, Arm Limited. All rights reserved. 7 * 8 * SPDX-License-Identifier: BSD-3-Clause 9 * 10 * See BSD-3-Clause license in README.md 11 * 12 * Created on 7/23/18 13 * ========================================================================== */ 14 15 /* 16 * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as 17 * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h 18 */ 19 #include "qcbor/qcbor_common.h" 20 21 #ifndef QCBOR_DISABLE_PREFERRED_FLOAT 22 23 #include "ieee754.h" 24 #include <string.h> /* For memcpy() */ 25 26 27 /* 28 * This code has long lines and is easier to read because of 29 * them. Some coding guidelines prefer 80 column lines (can they not 30 * afford big displays?). 31 * 32 * This code works solely using shifts and masks and thus has no 33 * dependency on any math libraries. It can even work if the CPU 34 * doesn't have any floating-point support, though that isn't the most 35 * useful thing to do. 36 * 37 * The memcpy() dependency is only for CopyFloatToUint32() and friends 38 * which only is needed to avoid type punning when converting the 39 * actual float bits to an unsigned value so the bit shifts and masks 40 * can work. 41 * 42 * The references used to write this code: 43 * 44 * IEEE 754-2008, particularly section 3.6 and 6.2.1 45 * 46 * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages 47 * 48 * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values 49 * 50 * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules 51 * 52 * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be 53 * 54 * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not 55 * needed. It can be retrieved from github history if needed. 56 */ 57 58 59 60 61 /* ----- Half Precsion ----------- */ 62 #define HALF_NUM_SIGNIFICAND_BITS (10) 63 #define HALF_NUM_EXPONENT_BITS (5) 64 #define HALF_NUM_SIGN_BITS (1) 65 66 #define HALF_SIGNIFICAND_SHIFT (0) 67 #define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS) 68 #define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS) 69 70 #define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits 71 #define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent 72 #define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign 73 #define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200 74 75 /* Biased Biased Unbiased Use 76 * 0x00 0 -15 0 and subnormal 77 * 0x01 1 -14 Smallest normal exponent 78 * 0x1e 30 15 Largest normal exponent 79 * 0x1F 31 16 NaN and Infinity */ 80 #define HALF_EXPONENT_BIAS (15) 81 #define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased 82 #define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased 83 #define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased 84 #define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased 85 86 87 /* ------ Single-Precision -------- */ 88 #define SINGLE_NUM_SIGNIFICAND_BITS (23) 89 #define SINGLE_NUM_EXPONENT_BITS (8) 90 #define SINGLE_NUM_SIGN_BITS (1) 91 92 #define SINGLE_SIGNIFICAND_SHIFT (0) 93 #define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS) 94 #define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS) 95 96 #define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits 97 #define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent 98 #define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign 99 #define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1)) 100 101 /* Biased Biased Unbiased Use 102 * 0x0000 0 -127 0 and subnormal 103 * 0x0001 1 -126 Smallest normal exponent 104 * 0x7f 127 0 1 105 * 0xfe 254 127 Largest normal exponent 106 * 0xff 255 128 NaN and Infinity */ 107 #define SINGLE_EXPONENT_BIAS (127) 108 #define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS) 109 #define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1) 110 #define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS) 111 #define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1) 112 113 114 /* --------- Double-Precision ---------- */ 115 #define DOUBLE_NUM_SIGNIFICAND_BITS (52) 116 #define DOUBLE_NUM_EXPONENT_BITS (11) 117 #define DOUBLE_NUM_SIGN_BITS (1) 118 119 #define DOUBLE_SIGNIFICAND_SHIFT (0) 120 #define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS) 121 #define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS) 122 123 #define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits 124 #define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent 125 #define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign 126 #define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1)) 127 128 129 /* Biased Biased Unbiased Use 130 * 0x00000000 0 -1023 0 and subnormal 131 * 0x00000001 1 -1022 Smallest normal exponent 132 * 0x000007fe 2046 1023 Largest normal exponent 133 * 0x000007ff 2047 1024 NaN and Infinity */ 134 #define DOUBLE_EXPONENT_BIAS (1023) 135 #define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS) 136 #define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1) 137 #define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS) 138 #define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1) 139 140 141 142 143 /* 144 * Convenient functions to avoid type punning, compiler warnings and 145 * such. The optimizer reduces them to a simple assignment. This is a 146 * crusty corner of C. It shouldn't be this hard. 147 * 148 * These are also in UsefulBuf.h under a different name. They are copied 149 * here to avoid a dependency on UsefulBuf.h. There is no object code 150 * size impact because these always optimze down to a simple assignment. 151 */ 152 static inline uint32_t 153 CopyFloatToUint32(float f) 154 { 155 uint32_t u32; 156 memcpy(&u32, &f, sizeof(uint32_t)); 157 return u32; 158 } 159 160 static inline uint64_t 161 CopyDoubleToUint64(double d) 162 { 163 uint64_t u64; 164 memcpy(&u64, &d, sizeof(uint64_t)); 165 return u64; 166 } 167 168 static inline double 169 CopyUint64ToDouble(uint64_t u64) 170 { 171 double d; 172 memcpy(&d, &u64, sizeof(uint64_t)); 173 return d; 174 } 175 176 static inline float 177 CopyUint32ToSingle(uint32_t u32) 178 { 179 float f; 180 memcpy(&f, &u32, sizeof(uint32_t)); 181 return f; 182 } 183 184 185 186 187 /** 188 * @brief Assemble sign, significand and exponent into single precision float. 189 * 190 * @param[in] uDoubleSign 0 if positive, 1 if negative 191 * @pararm[in] uDoubleSignificand Bits of the significand 192 * @param[in] nDoubleUnBiasedExponent Exponent 193 * 194 * This returns the bits for a single-precision float, a binary64 195 * as specified in IEEE754. 196 */ 197 static double 198 IEEE754_AssembleDouble(uint64_t uDoubleSign, 199 uint64_t uDoubleSignificand, 200 int64_t nDoubleUnBiasedExponent) 201 { 202 uint64_t uDoubleBiasedExponent; 203 204 uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS); 205 206 return CopyUint64ToDouble(uDoubleSignificand | 207 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) | 208 (uDoubleSign << DOUBLE_SIGN_SHIFT)); 209 } 210 211 212 double 213 IEEE754_HalfToDouble(uint16_t uHalfPrecision) 214 { 215 uint64_t uDoubleSignificand; 216 int64_t nDoubleUnBiasedExponent; 217 double dResult; 218 219 /* Pull out the three parts of the half-precision float. Do all 220 * the work in 64 bits because that is what the end result is. It 221 * may give smaller code size and will keep static analyzers 222 * happier. 223 */ 224 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK; 225 const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT; 226 const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS; 227 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT; 228 229 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) { 230 /* 0 or subnormal */ 231 if(uHalfSignificand) { 232 /* --- SUBNORMAL --- */ 233 /* A half-precision subnormal can always be converted to a 234 * normal double-precision float because the ranges line up. 235 * The exponent of a subnormal starts out at the min exponent 236 * for a normal. As the sub normal significand bits are 237 * shifted, left to normalize, the exponent is 238 * decremented. Shifting continues until fully normalized. 239 */ 240 nDoubleUnBiasedExponent = HALF_EXPONENT_MIN; 241 uDoubleSignificand = uHalfSignificand; 242 do { 243 uDoubleSignificand <<= 1; 244 nDoubleUnBiasedExponent--; 245 } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0); 246 /* A normal has an implied 1 in the most significant 247 * position that a subnormal doesn't. */ 248 uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS; 249 /* Must shift into place for a double significand */ 250 uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS; 251 252 dResult = IEEE754_AssembleDouble(uHalfSign, 253 uDoubleSignificand, 254 nDoubleUnBiasedExponent); 255 } else { 256 /* --- ZERO --- */ 257 dResult = IEEE754_AssembleDouble(uHalfSign, 258 0, 259 DOUBLE_EXPONENT_ZERO); 260 } 261 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) { 262 /* NaN or Inifinity */ 263 if(uHalfSignificand) { 264 /* --- NaN --- */ 265 /* Half-precision payloads always fit into double precision 266 * payloads. They are shifted left the same as a normal 267 * number significand. 268 */ 269 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); 270 dResult = IEEE754_AssembleDouble(uHalfSign, 271 uDoubleSignificand, 272 DOUBLE_EXPONENT_INF_OR_NAN); 273 } else { 274 /* --- INFINITY --- */ 275 dResult = IEEE754_AssembleDouble(uHalfSign, 276 0, 277 DOUBLE_EXPONENT_INF_OR_NAN); 278 } 279 } else { 280 /* --- NORMAL NUMBER --- */ 281 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); 282 dResult = IEEE754_AssembleDouble(uHalfSign, 283 uDoubleSignificand, 284 nHalfUnBiasedExponent); 285 } 286 287 return dResult; 288 } 289 290 291 /** 292 * @brief Assemble sign, significand and exponent into single precision float. 293 * 294 * @param[in] uHalfSign 0 if positive, 1 if negative 295 * @pararm[in] uHalfSignificand Bits of the significand 296 * @param[in] nHalfUnBiasedExponent Exponent 297 * 298 * This returns the bits for a single-precision float, a binary32 as 299 * specified in IEEE754. It is returned as a uint64_t rather than a 300 * uint32_t or a float for convenience of usage. 301 */ 302 static uint32_t 303 IEEE754_AssembleHalf(uint32_t uHalfSign, 304 uint32_t uHalfSignificand, 305 int32_t nHalfUnBiasedExponent) 306 { 307 uint32_t uHalfUnbiasedExponent; 308 309 uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS); 310 311 return uHalfSignificand | 312 (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) | 313 (uHalfSign << HALF_SIGN_SHIFT); 314 } 315 316 317 /* Public function; see ieee754.h */ 318 IEEE754_union 319 IEEE754_SingleToHalf(float f) 320 { 321 IEEE754_union result; 322 uint32_t uDroppedBits; 323 int32_t nExponentDifference; 324 int32_t nShiftAmount; 325 uint32_t uHalfSignificand; 326 327 /* Pull the three parts out of the double-precision float Most work 328 * is done with uint32_t which helps avoid integer promotions and 329 * static analyzer complaints. 330 */ 331 const uint32_t uSingle = CopyFloatToUint32(f); 332 const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT; 333 const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS; 334 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK; 335 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT; 336 337 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) { 338 if(uSingleSignificand == 0) { 339 /* --- IS ZERO --- */ 340 result.uSize = IEEE754_UNION_IS_HALF; 341 result.uValue = IEEE754_AssembleHalf(uSingleSign, 342 0, 343 HALF_EXPONENT_ZERO); 344 } else { 345 /* --- IS SINGLE SUBNORMAL --- */ 346 /* The largest single subnormal is slightly less than the 347 * largest single normal which is 2^-149 or 348 * 2.2040517676619426e-38. The smallest half subnormal is 349 * 2^-14 or 5.9604644775390625E-8. There is no overlap so 350 * single subnormals can't be converted to halfs of any sort. 351 */ 352 result.uSize = IEEE754_UNION_IS_SINGLE; 353 result.uValue = uSingle; 354 } 355 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) { 356 if(uSingleSignificand == 0) { 357 /* ---- IS INFINITY ---- */ 358 result.uSize = IEEE754_UNION_IS_HALF; 359 result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN); 360 } else { 361 /* The NaN can only be converted if no payload bits are lost 362 * per RFC 8949 section 4.1 that defines Preferred 363 * Serializaton. Note that Deterministically Encode CBOR in 364 * section 4.2 allows for some variation of this rule, but at 365 * the moment this implementation is of Preferred 366 * Serialization, not CDE. As of December 2023, we are also 367 * expecting an update to CDE. This code may need to be 368 * updated for CDE. 369 */ 370 uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS); 371 if(uDroppedBits == 0) { 372 /* --- IS CONVERTABLE NAN --- */ 373 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); 374 result.uSize = IEEE754_UNION_IS_HALF; 375 result.uValue = IEEE754_AssembleHalf(uSingleSign, 376 uHalfSignificand, 377 HALF_EXPONENT_INF_OR_NAN); 378 379 } else { 380 /* --- IS UNCONVERTABLE NAN --- */ 381 result.uSize = IEEE754_UNION_IS_SINGLE; 382 result.uValue = uSingle; 383 } 384 } 385 } else { 386 /* ---- REGULAR NUMBER ---- */ 387 /* A regular single can be converted to a regular half if the 388 * single's exponent is in the smaller range of a half and if no 389 * precision is lost in the significand. 390 */ 391 if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN && 392 nSingleUnbiasedExponent <= HALF_EXPONENT_MAX && 393 (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) { 394 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); 395 396 /* --- CONVERT TO HALF NORMAL --- */ 397 result.uSize = IEEE754_UNION_IS_HALF; 398 result.uValue = IEEE754_AssembleHalf(uSingleSign, 399 uHalfSignificand, 400 nSingleUnbiasedExponent); 401 } else { 402 /* Unable to convert to a half normal. See if it can be 403 * converted to a half subnormal. To do that, the exponent 404 * must be in range and no precision can be lost in the 405 * signficand. 406 * 407 * This is more complicated because the number is not 408 * normalized. The signficand must be shifted proprotionally 409 * to the exponent and 1 must be added in. See 410 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding 411 * 412 * Exponents -14 to -24 map to a shift of 0 to 10 of the 413 * significand. The largest value of a half subnormal has an 414 * exponent of -14. Subnormals are not normalized like 415 * normals meaning they lose precision as the numbers get 416 * smaller. Normals don't lose precision because the exponent 417 * allows all the bits of the significand to be significant. 418 */ 419 /* The exponent of the largest possible half-precision 420 * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger 421 * than this are normal and handled above. We're going to 422 * shift the significand right by at least this amount. 423 */ 424 nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN); 425 426 /* In addition to the shift based on the exponent's value, 427 * the single significand has to be shifted right to fit into 428 * a half-precision significand */ 429 nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS); 430 431 /* Must add 1 in to the possible significand because there is 432 * an implied 1 for normal values and not for subnormal 433 * values. See equations here: 434 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding 435 */ 436 uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; 437 438 /* If only zero bits get shifted out, this can be converted 439 * to subnormal */ 440 if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN && 441 nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS && 442 uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) { 443 /* --- CONVERTABLE TO HALF SUBNORMAL --- */ 444 result.uSize = IEEE754_UNION_IS_HALF; 445 result.uValue = IEEE754_AssembleHalf(uSingleSign, 446 uHalfSignificand, 447 HALF_EXPONENT_ZERO); 448 } else { 449 /* --- DO NOT CONVERT --- */ 450 result.uSize = IEEE754_UNION_IS_SINGLE; 451 result.uValue = uSingle; 452 } 453 } 454 } 455 456 return result; 457 } 458 459 460 /** 461 * @brief Assemble sign, significand and exponent into single precision float. 462 * 463 * @param[in] uSingleSign 0 if positive, 1 if negative 464 * @pararm[in] uSingleSignificand Bits of the significand 465 * @param[in] nSingleUnBiasedExponent Exponent 466 * 467 * This returns the bits for a single-precision float, a binary32 as 468 * specified in IEEE754. It is returned as a uint64_t rather than a 469 * uint32_t or a float for convenience of usage. 470 */ 471 static uint64_t 472 IEEE754_AssembleSingle(uint64_t uSingleSign, 473 uint64_t uSingleSignificand, 474 int64_t nSingleUnBiasedExponent) 475 { 476 uint64_t uSingleBiasedExponent; 477 478 uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS); 479 480 return uSingleSignificand | 481 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) | 482 (uSingleSign << SINGLE_SIGN_SHIFT); 483 } 484 485 486 /** 487 * @brief Convert a double-precision float to single-precision. 488 * 489 * @param[in] d The value to convert. 490 * 491 * @returns Either unconverted value or value converted to single-precision. 492 * 493 * This always succeeds. If the value cannot be converted without the 494 * loss of precision, it is not converted. 495 * 496 * This handles all subnormals and NaN payloads. 497 */ 498 static IEEE754_union 499 IEEE754_DoubleToSingle(double d) 500 { 501 IEEE754_union Result; 502 int64_t nExponentDifference; 503 int64_t nShiftAmount; 504 uint64_t uSingleSignificand; 505 uint64_t uDroppedBits; 506 507 508 /* Pull the three parts out of the double-precision float. Most 509 * work is done with uint64_t which helps avoid integer promotions 510 * and static analyzer complaints. 511 */ 512 const uint64_t uDouble = CopyDoubleToUint64(d); 513 const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT; 514 const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS; 515 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT; 516 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK; 517 518 519 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) { 520 if(uDoubleSignificand == 0) { 521 /* --- IS ZERO --- */ 522 Result.uSize = IEEE754_UNION_IS_SINGLE; 523 Result.uValue = IEEE754_AssembleSingle(uDoubleSign, 524 0, 525 SINGLE_EXPONENT_ZERO); 526 } else { 527 /* --- IS DOUBLE SUBNORMAL --- */ 528 /* The largest double subnormal is slightly less than the 529 * largest double normal which is 2^-1022 or 530 * 2.2250738585072014e-308. The smallest single subnormal 531 * is 2^-149 or 1.401298464324817e-45. There is no 532 * overlap so double subnormals can't be converted to 533 * singles of any sort. 534 */ 535 Result.uSize = IEEE754_UNION_IS_DOUBLE; 536 Result.uValue = uDouble; 537 } 538 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) { 539 if(uDoubleSignificand == 0) { 540 /* ---- IS INFINITY ---- */ 541 Result.uSize = IEEE754_UNION_IS_SINGLE; 542 Result.uValue = IEEE754_AssembleSingle(uDoubleSign, 543 0, 544 SINGLE_EXPONENT_INF_OR_NAN); 545 } else { 546 /* The NaN can only be converted if no payload bits are 547 * lost per RFC 8949 section 4.1 that defines Preferred 548 * Serializaton. Note that Deterministically Encode CBOR 549 * in section 4.2 allows for some variation of this rule, 550 * but at the moment this implementation is of Preferred 551 * Serialization, not CDE. As of December 2023, we are 552 * also expecting an update to CDE. This code may need to 553 * be updated for CDE. 554 */ 555 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); 556 if(uDroppedBits == 0) { 557 /* --- IS CONVERTABLE NAN --- */ 558 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); 559 Result.uSize = IEEE754_UNION_IS_SINGLE; 560 Result.uValue = IEEE754_AssembleSingle(uDoubleSign, 561 uSingleSignificand, 562 SINGLE_EXPONENT_INF_OR_NAN); 563 } else { 564 /* --- IS UNCONVERTABLE NAN --- */ 565 Result.uSize = IEEE754_UNION_IS_DOUBLE; 566 Result.uValue = uDouble; 567 } 568 } 569 } else { 570 /* ---- REGULAR NUMBER ---- */ 571 /* A regular double can be converted to a regular single if 572 * the double's exponent is in the smaller range of a single 573 * and if no precision is lost in the significand. 574 */ 575 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS); 576 if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN && 577 nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX && 578 uDroppedBits == 0) { 579 /* --- IS CONVERTABLE TO SINGLE --- */ 580 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); 581 Result.uSize = IEEE754_UNION_IS_SINGLE; 582 Result.uValue = IEEE754_AssembleSingle(uDoubleSign, 583 uSingleSignificand, 584 nDoubleUnbiasedExponent); 585 } else { 586 /* Unable to convert to a single normal. See if it can be 587 * converted to a single subnormal. To do that, the 588 * exponent must be in range and no precision can be lost 589 * in the signficand. 590 * 591 * This is more complicated because the number is not 592 * normalized. The signficand must be shifted 593 * proprotionally to the exponent and 1 must be added 594 * in. See 595 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding 596 */ 597 nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN); 598 nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS); 599 uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount; 600 601 if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN && 602 nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS && 603 uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) { 604 /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */ 605 Result.uSize = IEEE754_UNION_IS_SINGLE; 606 Result.uValue = IEEE754_AssembleSingle(uDoubleSign, 607 uSingleSignificand, 608 SINGLE_EXPONENT_ZERO); 609 } else { 610 /* --- CAN NOT BE CONVERTED --- */ 611 Result.uSize = IEEE754_UNION_IS_DOUBLE; 612 Result.uValue = uDouble; 613 } 614 } 615 } 616 617 return Result; 618 } 619 620 621 /* Public function; see ieee754.h */ 622 IEEE754_union 623 IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision) 624 { 625 IEEE754_union result; 626 627 result = IEEE754_DoubleToSingle(d); 628 629 if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) { 630 /* Cast to uint32_t is OK, because value was just successfully 631 * converted to single. */ 632 float uSingle = CopyUint32ToSingle((uint32_t)result.uValue); 633 result = IEEE754_SingleToHalf(uSingle); 634 } 635 636 return result; 637 } 638 639 640 #else /* QCBOR_DISABLE_PREFERRED_FLOAT */ 641 642 int ieee754_dummy_place_holder; 643 644 #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */ 645