C 浮点类型

最后修改日期：2025 年 4 月 1 日

C 语言提供了几种具有不同精度和存储特性的浮点类型。本教程涵盖 IEEE 754 表示法、精度限制、硬件注意事项和实际用法模式。

我们将检查浮点数的二进制表示，解释舍入模式和非规格化数，并通过实际示例演示常见陷阱。理解这些概念对于系统编程和性能敏感型应用至关重要。

C 浮点类型

C 语言提供三种主要的浮点类型，精度递增

float_types.c

#include <stdio.h>
#include <float.h>

// Prints an approximation of pi stored in each of the three standard
// floating-point types, then the mantissa width <float.h> reports for each.
int main() {
    // The same constant, written with the suffix that selects each type.
    const float pi_single = 3.1415926535f;
    const double pi_double = 3.141592653589793;
    const long double pi_extended = 3.14159265358979323846L;

    // The requested digit counts are the same for float and double, so any
    // digits a type cannot hold show up as differences in the output.
    printf("float:       %.15f\n", pi_single);
    printf("double:      %.15lf\n", pi_double);
    printf("long double: %.21Lf\n", pi_extended);

    // The *_MANT_DIG macros come from <float.h>.
    printf("\nPrecision:\n");
    printf("float mantissa bits:  %d\n", FLT_MANT_DIG);
    printf("double mantissa bits: %d\n", DBL_MANT_DIG);
    printf("long double mantissa bits: %d\n", LDBL_MANT_DIG);

    return 0;
}

标准 C 浮点类型遵循 IEEE 754 规范（在支持的情况下）

float：32 位单精度（24 位尾数）
double：64 位双精度（53 位尾数）
long double：由实现定义（x86 上的 GCC/Clang 通常为 80 位扩展精度，MSVC 中则与 double 相同）

float.h 头文件定义了 FLT_MANT_DIG 等常量，用于揭示实现细节。请注意，long double 的精度因架构而异。

IEEE 754 二进制表示

浮点数使用符号-指数-尾数格式

float_representation.c

#include <stdio.h>
#include <stdint.h>

// Prints the value of f followed by its sign, exponent and mantissa fields
// and the full 32-bit pattern (sign, exponent and mantissa separated by
// spaces).
//
// Assumes float is a 32-bit IEEE 754 single-precision value.
void print_float_bits(float f) {
    // Read the object representation through a union. Dereferencing a
    // uint32_t pointer that really points at a float violates the
    // strict-aliasing rule and is undefined behavior.
    union {
        float value;
        uint32_t raw;
    } pun = { .value = f };
    uint32_t bits = pun.raw;

    uint32_t sign = bits >> 31;
    uint32_t exponent = (bits >> 23) & 0xFF;
    uint32_t mantissa = bits & 0x7FFFFF;

    // The casts make every argument match its conversion specifier exactly;
    // in particular the bias is subtracted in signed arithmetic so that
    // exponents below 127 produce a negative int rather than a wrapped
    // unsigned value.
    printf("Float: %f\n", f);
    printf("Sign: %d\n", (int)sign);
    printf("Exponent: 0x%X (%d biased, %d actual)\n",
           (unsigned)exponent, (int)exponent, (int)exponent - 127);
    printf("Mantissa: 0x%X\n", (unsigned)mantissa);
    printf("Binary: ");
    for (int i = 31; i >= 0; i--) {
        printf("%d", (int)((bits >> i) & 1));
        // Space after the sign bit (31) and after the last exponent bit (23).
        if (i == 31 || i == 23) printf(" ");
    }
    printf("\n\n");
}

// Dumps the bit layout of three sample values via print_float_bits().
int main() {
    const float samples[] = { 1.0f, 0.1f, -3.5f };
    const size_t sample_count = sizeof samples / sizeof samples[0];

    for (size_t idx = 0; idx < sample_count; idx++) {
        print_float_bits(samples[idx]);
    }

    return 0;
}

IEEE 754 单精度格式包括

1 位符号位：0 为正，1 为负
8 位指数位：存储时带有 127 的偏差
23 位尾数位：隐含前导 1（规格化数）

规格化数的值为 $(-1)^{\text{sign}} \times 2^{\text{exponent} - 127} \times (1.\text{mantissa})_2$

特殊浮点值

IEEE 754 定义了特殊的位模式

special_values.c

#include <stdio.h>
#include <math.h>

// Prints the special values INFINITY, NAN and the two signed zeros, shows
// how they compare with ==, and prints the results of the <math.h>
// classification macros.
int main() {
    const float pos_inf = INFINITY;
    const float not_a_number = NAN;
    const float pos_zero = 0.0f;
    const float minus_zero = -0.0f;

    printf("Positive infinity: %f\n", pos_inf);
    printf("NaN: %f\n", not_a_number);
    printf("Zero: %f\n", pos_zero);
    printf("Negative zero: %f\n", minus_zero);

    // Expected results: infinity equals itself, NaN does not equal itself,
    // and the two zeros compare equal despite their different sign bits.
    printf("\nSpecial comparisons:\n");
    printf("inf == inf: %d\n", pos_inf == pos_inf);
    printf("nan == nan: %d\n", not_a_number == not_a_number);
    printf("zero == neg_zero: %d\n", pos_zero == minus_zero);

    // The classification macros return int; fpclassify yields one of the
    // FP_* category constants, whose numeric values are implementation-defined.
    printf("\nClassification:\n");
    printf("isinf(inf): %d\n", isinf(pos_inf));
    printf("isnan(nan): %d\n", isnan(not_a_number));
    printf("isnormal(1.0f): %d\n", isnormal(1.0f));
    printf("fpclassify(denormal): %d\n", fpclassify(1e-45f));

    return 0;
}

特殊的浮点值包括

无穷大：0x7F800000（正无穷大），0xFF800000（负无穷大）
NaN：指数为 255 且尾数不为 0 的任何值
零：指数为 0，尾数为 0（符号区分 ±0）
非规格化数：指数为 0，尾数不为 0

math.h 头文件提供了分类宏（isnan、isinf 等）以便正确处理。

精度与舍入

浮点运算涉及舍入

rounding.c

#include <stdio.h>
#include <fenv.h>

// Prints the name of the rounding-direction macro that matches the value
// returned by fegetround(), or "Unknown" if it matches none of the four.
void show_rounding_mode() {
    const int mode = fegetround();

    if (mode == FE_TONEAREST) {
        printf("FE_TONEAREST\n");
    } else if (mode == FE_DOWNWARD) {
        printf("FE_DOWNWARD\n");
    } else if (mode == FE_UPWARD) {
        printf("FE_UPWARD\n");
    } else if (mode == FE_TOWARDZERO) {
        printf("FE_TOWARDZERO\n");
    } else {
        printf("Unknown\n");
    }
}

// Prints 1/3 computed in float under the default rounding mode and again
// after switching the rounding direction to FE_UPWARD.
//
// Returns 0 on success, 1 if the rounding mode could not be changed.
int main(void) {
    // Tell the compiler that this block reads and modifies the
    // floating-point environment (compilers that do not implement the
    // pragma ignore it).
    #pragma STDC FENV_ACCESS ON

    printf("Default rounding: ");
    show_rounding_mode();

    // volatile operands force each division to be carried out at run time.
    // A literal 1.0f / 3.0f is a constant expression that the compiler may
    // evaluate once at compile time, which would make both results equal
    // regardless of the rounding mode set below.
    volatile float one = 1.0f;
    volatile float three = 3.0f;

    // Demonstrate rounding effects
    float a = one / three;
    printf("1/3 as float: %.20f\n", a);

    // Change rounding mode; fesetround returns nonzero on failure.
    if (fesetround(FE_UPWARD) != 0) {
        fprintf(stderr, "fesetround(FE_UPWARD) failed\n");
        return 1;
    }
    printf("Current rounding: ");
    show_rounding_mode();

    float b = one / three;
    printf("1/3 with FE_UPWARD: %.20f\n", b);

    return 0;
}

关键精度概念

机器 epsilon：FLT_EPSILON（float 为 2^-23）
舍入模式：就近舍入（默认）、向上舍入、向下舍入、向零舍入
保护位：计算过程中使用的额外精度

fenv.h 头文件提供了对舍入模式和浮点环境的控制。

非规格化数

非常小的数使用非规格化表示

denormals.c

#include <stdio.h>
#include <float.h>
#include <math.h>   // isnormal, fpclassify

// Prints FLT_MIN and half of it, shows how the <math.h> classification
// macros categorize the two values, then repeatedly adds the halved value
// in a loop.
int main(void) {
    float normal = FLT_MIN;          // Smallest positive normal float
    float denormal = normal / 2.0f;  // Below FLT_MIN, so no longer normal

    printf("FLT_MIN: %e\n", normal);
    printf("FLT_MIN/2: %e\n", denormal);

    printf("\nProperties:\n");
    printf("isnormal(normal): %d\n", isnormal(normal));
    printf("isnormal(denormal): %d\n", isnormal(denormal));
    // fpclassify yields an FP_* constant; its numeric value is
    // implementation-defined.
    printf("fpclassify(denormal): %d\n", fpclassify(denormal));

    // Performance impact: arithmetic on subnormal values may be much slower
    // than on normal floats, depending on the hardware. volatile keeps the
    // loop from being optimized away even though sum is never printed.
    volatile float sum = 0.0f;
    for (int i = 0; i < 1000000; i++) {
        sum += denormal;
    }

    return 0;
}

非规格化数

指数为 0，尾数不为 0
表示比 FLT_MIN 小的值
当它们接近零时会丢失精度
通常会导致显著的性能下降

某些系统为了提高性能，会将非规格化数刷新为零（flush-to-zero，FTZ）。

误差累积

浮点误差在计算中会累加

error_accumulation.c

#include <stdio.h>
#include <math.h>

// Adds 0.01f ten thousand times, first naively and then with Kahan
// (compensated) summation, and finally prints (1e8f + 1.0f) - 1e8f.
int main() {
    // Naive accumulation: every addition rounds, and the rounding errors
    // build up in the running total.
    float naive_total = 0.0f;
    for (int step = 0; step < 10000; step++) {
        naive_total += 0.01f;
    }
    printf("Sum of 0.01 10000 times: %.10f\n", naive_total);

    // Kahan summation: carry holds the part of the previous addend that was
    // lost to rounding, and is fed back into the next addend.
    float compensated_total = 0.0f;
    float carry = 0.0f;
    for (int step = 0; step < 10000; step++) {
        float adjusted = 0.01f - carry;
        float candidate = compensated_total + adjusted;
        // (candidate - total) is what was actually added; subtracting the
        // intended addend leaves the rounding error of this step.
        carry = (candidate - compensated_total) - adjusted;
        compensated_total = candidate;
    }

    printf("Kahan sum: %.10f\n", compensated_total);

    // Add 1 to a large value and subtract the large value again. The printed
    // difference shows whether the 1 survived in float precision (with
    // IEEE 754 single precision it is absorbed by the larger operand).
    float big = 1e8f;
    float bumped = big + 1.0f;
    printf("(1e8 + 1) - 1e8 = %.1f\n", bumped - big);

    return 0;
}

常见误差源

舍入误差：每次运算都会引入微小误差
灾难性抵消：两个近似相等数字的减法
吸收：将小数字加到大数字上

Kahan 求和算法演示了如何减少累积误差。

浮点异常

浮点运算可能引发异常

exceptions.c

#include <stdio.h>
#include <fenv.h>
#include <float.h>
#include <math.h>

#pragma STDC FENV_ACCESS ON

// Prints "Raised exceptions: " followed by the name of each standard
// floating-point exception flag that fetestexcept() reports as set, then a
// newline.
void show_exceptions() {
    // Flags are tested in this order; each label carries its trailing space.
    static const struct {
        int flag;
        const char *label;
    } checks[] = {
        { FE_DIVBYZERO, "FE_DIVBYZERO " },
        { FE_INVALID,   "FE_INVALID " },
        { FE_OVERFLOW,  "FE_OVERFLOW " },
        { FE_UNDERFLOW, "FE_UNDERFLOW " },
        { FE_INEXACT,   "FE_INEXACT " },
    };
    const size_t check_count = sizeof checks / sizeof checks[0];

    printf("Raised exceptions: ");
    for (size_t idx = 0; idx < check_count; idx++) {
        if (fetestexcept(checks[idx].flag)) {
            printf("%s", checks[idx].label);
        }
    }
    printf("\n");
}

// Triggers a division by zero, an invalid operation and an overflow in
// turn, clearing the exception flags before each one and printing the
// flags that are set afterwards.
int main(void) {
    // volatile operands and a volatile result force every operation to be
    // executed at run time. With literal operands and unused results the
    // compiler may fold the expressions or drop them as dead code, in which
    // case no exception flag would ever be raised.
    volatile float zero = 0.0f;
    volatile float one = 1.0f;
    volatile float minus_one = -1.0f;
    volatile float largest = FLT_MAX;
    volatile float result;

    feclearexcept(FE_ALL_EXCEPT);
    result = one / zero;          // Division by zero
    show_exceptions();

    feclearexcept(FE_ALL_EXCEPT);
    result = sqrtf(minus_one);    // Invalid operation (float variant of sqrt)
    show_exceptions();

    feclearexcept(FE_ALL_EXCEPT);
    result = largest * 2.0f;      // Overflow
    show_exceptions();

    (void)result;  // The values are not needed, only the flags they raise.
    return 0;
}

标准浮点异常

FE_DIVBYZERO：除以零
FE_INVALID：无效操作（sqrt(-1)）
FE_OVERFLOW：结果过大无法表示
FE_UNDERFLOW：结果过小无法表示
FE_INEXACT：不精确结果（发生舍入）

异常处理需要仔细管理浮点环境。

硬件注意事项

浮点性能因架构而异

hardware.c

#include <stdio.h>

// Prints the x87 FPU control word as a four-digit hexadecimal number.
// Does nothing when the target is neither x86-64 nor 32-bit x86.
void print_fpu_control() {
    // The inline assembly below uses GCC-style extended asm and an x86-only
    // instruction, so it is compiled only for x86 targets.
    #if defined(__x86_64__) || defined(__i386__)
    unsigned short cw;
    // fstcw stores the control word into the memory operand cw.
    __asm__ __volatile__ ("fstcw %0" : "=m" (cw));
    printf("FPU control word: 0x%04X\n", cw);
    #endif
}

// Reports which SIMD feature macros the compiler defines, prints the FPU
// control word, and, when SSE is available, adds two 4-element float
// vectors with a single packed SSE addition.
int main(void) {
    printf("FPU features:\n");

    #ifdef __SSE2__
    printf("SSE2 available\n");
    #endif

    #ifdef __AVX__
    printf("AVX available\n");
    #endif

    print_fpu_control();

    // SIMD example
    #ifdef __SSE__
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float c[4];

    // movups is the unaligned load/store, so the arrays need no special
    // alignment. xmm0 and xmm1 are listed as clobbers because the asm
    // overwrites them; without the clobber list the compiler is free to
    // assume they still hold whatever it placed there.
    __asm__ (
        "movups %1, %%xmm0\n"
        "movups %2, %%xmm1\n"
        "addps %%xmm1, %%xmm0\n"
        "movups %%xmm0, %0"
        : "=m" (c)
        : "m" (a), "m" (b)
        : "xmm0", "xmm1"
    );
    printf("SIMD add: %.1f, %.1f, %.1f, %.1f\n",
           c[0], c[1], c[2], c[3]);
    #endif

    return 0;
}

关键硬件方面

x87 FPU：传统浮点堆栈架构
SSE/AVX：现代 SIMD 浮点指令
控制寄存器：管理舍入、精度、异常
性能：非规格化数、精度混合会影响速度

现代编译器会根据目标架构生成优化代码。

最佳实践

选择合适的精度：内存受限时使用 float，大多数计算使用 double
避免相等性比较：改用相对误差检查
尽量减少运算次数：以减少误差累积
注意硬件影响：非规格化数、SIMD 对齐
使用编译器标志：-ffast-math（谨慎使用）、-mfpmath
考虑替代方案：某些应用使用定点数

资料来源

作者

我叫 Jan Bodnar，我是一名热情的程序员，拥有丰富的编程经验。我自 2007 年以来一直撰写编程文章。至今，我已撰写了 1,400 多篇文章和 8 本电子书。我在编程教学方面拥有十多年的经验。