Defined Formats
Format parameters
This table (from example notebook value-stats) shows how gfloat has been used to tabulate properties of various floating point formats.
name: Format
B: Bits in the format
P: Precision in bits
E: Exponent field width in bits
smallest: Smallest positive value
smallest_normal: Smallest positive normal value, n/a if no finite values are normal
max: Largest finite value
num_nans: Number of NaN values
infs: Number of infinities (2 or 0)
name |
B |
P |
E |
smallest |
smallest_normal |
max |
num_nans |
infs |
|---|---|---|---|---|---|---|---|---|
ocp_e2m1 |
4 |
2 |
2 |
0.5 |
1 |
6 |
0 |
0 |
ocp_e2m3 |
6 |
4 |
2 |
0.125 |
1 |
7.5 |
0 |
0 |
ocp_e3m2 |
6 |
3 |
3 |
0.0625 |
0.25 |
28 |
0 |
0 |
ocp_e4m3 |
8 |
4 |
4 |
≈0.0019531 |
0.015625 |
448 |
2 |
0 |
ocp_e5m2 |
8 |
3 |
5 |
≈1.5259e-05 |
≈6.1035e-05 |
57344 |
6 |
2 |
p3109_8p1 |
8 |
1 |
7 |
≈2.1684e-19 |
≈2.1684e-19 |
≈9.2234e+18 |
1 |
2 |
p3109_8p2 |
8 |
2 |
6 |
≈2.3283e-10 |
≈4.6566e-10 |
≈2.1475e+09 |
1 |
2 |
p3109_8p3 |
8 |
3 |
5 |
≈7.6294e-06 |
≈3.0518e-05 |
49152 |
1 |
2 |
p3109_8p4 |
8 |
4 |
4 |
≈0.00097656 |
0.0078125 |
224 |
1 |
2 |
p3109_8p5 |
8 |
5 |
3 |
0.0078125 |
0.125 |
15 |
1 |
2 |
p3109_8p6 |
8 |
6 |
2 |
0.015625 |
0.5 |
3.875 |
1 |
2 |
binary16 |
16 |
11 |
5 |
≈5.9605e-08 |
≈6.1035e-05 |
65504 |
2046 |
2 |
bfloat16 |
16 |
8 |
8 |
≈9.1835e-41 |
≈1.1755e-38 |
≈3.3895e+38 |
254 |
2 |
binary32 |
32 |
24 |
8 |
≈1.4013e-45 |
≈1.1755e-38 |
≈3.4028e+38 |
≈1.6777e+07 |
2 |
binary64 |
64 |
53 |
11 |
≈4.9407e-324 |
≈2.2251e-308 |
≈1.7977e+308 |
≈9.0072e+15 |
2 |
ocp_e8m0 |
8 |
1 |
8 |
≈5.8775e-39 |
≈5.8775e-39 |
≈1.7014e+38 |
1 |
0 |
ocp_int8 |
8 |
8 |
0 |
0.015625 |
n/a |
≈1.9844 |
0 |
0 |
In the above table, values which are not exact are indicated with the “≈” symbol. Here is the same table again, but with the values that do not render exactly as short decimal floats printed instead as rationals times powers of 2:
name |
B |
P |
E |
smallest |
smallest_normal |
max |
num_nans |
infs |
|---|---|---|---|---|---|---|---|---|
ocp_e2m1 |
4 |
2 |
2 |
0.5 |
1 |
6 |
0 |
0 |
ocp_e2m3 |
6 |
4 |
2 |
0.125 |
1 |
7.5 |
0 |
0 |
ocp_e3m2 |
6 |
3 |
3 |
0.0625 |
0.25 |
28 |
0 |
0 |
ocp_e4m3 |
8 |
4 |
4 |
2^-9 |
0.015625 |
448 |
2 |
0 |
ocp_e5m2 |
8 |
3 |
5 |
2^-16 |
2^-14 |
57344 |
6 |
2 |
p3109_8p1 |
8 |
1 |
7 |
2^-62 |
2^-62 |
2^63 |
1 |
2 |
p3109_8p2 |
8 |
2 |
6 |
2^-32 |
2^-31 |
2^31 |
1 |
2 |
p3109_8p3 |
8 |
3 |
5 |
2^-17 |
2^-15 |
49152 |
1 |
2 |
p3109_8p4 |
8 |
4 |
4 |
2^-10 |
0.0078125 |
224 |
1 |
2 |
p3109_8p5 |
8 |
5 |
3 |
0.0078125 |
0.125 |
15 |
1 |
2 |
p3109_8p6 |
8 |
6 |
2 |
0.015625 |
0.5 |
3.875 |
1 |
2 |
binary16 |
16 |
11 |
5 |
2^-24 |
2^-14 |
65504 |
2046 |
2 |
bfloat16 |
16 |
8 |
8 |
2^-133 |
2^-126 |
255/128*2^127 |
254 |
2 |
binary32 |
32 |
24 |
8 |
2^-149 |
2^-126 |
16777215/8388608*2^127 |
8388607/4194304*2^23 |
2 |
binary64 |
64 |
53 |
11 |
2^-1074 |
2^-1022 |
9007199254740991/9007199254740992*2^1024 |
4503599627370495/4503599627370496*2^53 |
2 |
ocp_e8m0 |
8 |
1 |
8 |
2^-127 |
2^-127 |
2^127 |
1 |
0 |
ocp_int8 |
8 |
8 |
0 |
0.015625 |
n/a |
127/64*2^0 |
0 |
0 |
IEEE 754 Formats
- gfloat.formats.format_info_binary16 = FormatInfo(name='binary16', k=16, precision=11, bias=15, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=1023, has_subnormals=True, is_twos_complement=False)
FormatInfo for IEEE-754 Binary16 format
- gfloat.formats.format_info_binary32 = FormatInfo(name='binary32', k=32, precision=24, bias=127, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=8388607, has_subnormals=True, is_twos_complement=False)
FormatInfo for IEEE-754 Binary32 format
- gfloat.formats.format_info_binary64 = FormatInfo(name='binary64', k=64, precision=53, bias=1023, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=4503599627370495, has_subnormals=True, is_twos_complement=False)
FormatInfo for IEEE-754 Binary64 format
BFloat16
- gfloat.formats.format_info_bfloat16 = FormatInfo(name='bfloat16', k=16, precision=8, bias=127, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=127, has_subnormals=True, is_twos_complement=False)
FormatInfo for Google BFloat16 format
Open Compute Project (OCP) Formats
- gfloat.formats.format_info_ocp_e5m2 = FormatInfo(name='ocp_e5m2', k=8, precision=3, bias=15, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=3, has_subnormals=True, is_twos_complement=False)
FormatInfo for OCP E5M2 format
- gfloat.formats.format_info_ocp_e4m3 = FormatInfo(name='ocp_e4m3', k=8, precision=4, bias=7, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=1, has_subnormals=True, is_twos_complement=False)
FormatInfo for OCP E4M3 format
- gfloat.formats.format_info_ocp_e3m2 = FormatInfo(name='ocp_e3m2', k=6, precision=3, bias=3, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False)
FormatInfo for OCP MX E3M2 format
- gfloat.formats.format_info_ocp_e2m3 = FormatInfo(name='ocp_e2m3', k=6, precision=4, bias=1, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False)
FormatInfo for OCP MX E2M3 format
- gfloat.formats.format_info_ocp_e2m1 = FormatInfo(name='ocp_e2m1', k=4, precision=2, bias=1, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False)
FormatInfo for OCP MX E2M1 format
- gfloat.formats.format_info_ocp_e8m0 = FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False)
FormatInfo for OCP MX E8M0 format
- gfloat.formats.format_info_ocp_int8 = FormatInfo(name='ocp_int8', k=8, precision=8, bias=0, is_signed=True, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=0, has_subnormals=True, is_twos_complement=True)
FormatInfo for OCP MX INT8 format
IEEE WG P3109 Formats
- gfloat.formats.format_info_p3109(k, precision, signedness=Signedness.Signed, domain=Domain.Extended)[source]
FormatInfo for P3109 K{k} P{p} [su] [ef] formats
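- Parameters:
k – Format width in bits
precision – Precision in bits
signedness – Signedness of the format (default: Signedness.Signed)
domain – Domain of the format, e.g. Domain.Extended (default) or Domain.Finite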
- Returns:
FormatInfo class describing the format
- Raises:
ValueError – If precision (p) is not in 1..k
ValueError – If k is < 2
Block Formats
- gfloat.formats.format_info_mxfp8_e5m2 = BlockFormatInfo(name='mxfp8_e5m2', etype=FormatInfo(name='ocp_e5m2', k=8, precision=3, bias=15, is_signed=True, domain=<Domain.Extended: 2>, has_nz=True, num_high_nans=3, has_subnormals=True, is_twos_complement=False), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)
- gfloat.formats.format_info_mxfp8_e4m3 = BlockFormatInfo(name='mxfp8_e4m3', etype=FormatInfo(name='ocp_e4m3', k=8, precision=4, bias=7, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=1, has_subnormals=True, is_twos_complement=False), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)
- gfloat.formats.format_info_mxfp6_e3m2 = BlockFormatInfo(name='mxfp6_e3m2', etype=FormatInfo(name='ocp_e3m2', k=6, precision=3, bias=3, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)
- gfloat.formats.format_info_mxfp6_e2m3 = BlockFormatInfo(name='mxfp6_e2m3', etype=FormatInfo(name='ocp_e2m3', k=6, precision=4, bias=1, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)
- gfloat.formats.format_info_mxfp4_e2m1 = BlockFormatInfo(name='mxfp4_e2m1', etype=FormatInfo(name='ocp_e2m1', k=4, precision=2, bias=1, is_signed=True, domain=<Domain.Finite: 1>, has_nz=True, num_high_nans=0, has_subnormals=True, is_twos_complement=False), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)
- gfloat.formats.format_info_mxint8 = BlockFormatInfo(name='mxint8', etype=FormatInfo(name='ocp_int8', k=8, precision=8, bias=0, is_signed=True, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=0, has_subnormals=True, is_twos_complement=True), k=32, stype=FormatInfo(name='ocp_e8m0', k=8, precision=1, bias=127, is_signed=False, domain=<Domain.Finite: 1>, has_nz=False, num_high_nans=1, has_subnormals=False, is_twos_complement=False))
BlockFormatInfo(name: str, etype: gfloat.types.FormatInfo, k: int, stype: gfloat.types.FormatInfo)