@@ -105,11 +105,135 @@ rem(x::Float16, ::Type{T}) where {T <: Normed} = rem(Float32(x), T) # avoid ove
105105
106106float (x:: Normed ) = convert (floattype (x), x)
107107
108- Base. BigFloat (x:: Normed ) = reinterpret (x)* (1 / BigFloat (rawone (x)))
108+ macro f32 (x:: Float64 ) # just for hexadecimal floating-point literals
109+ :(Float32 ($ x))
110+ end
111+ macro exp2 (n)
112+ :(_exp2 (Val ($ (esc (n)))))
113+ end
114+ _exp2 (:: Val{N} ) where {N} = exp2 (N)
115+
116+ # for Julia v1.0, which does not fold `div_float` before inlining
117+ inv_rawone (x) = (@generated ) ? (y = 1.0 / rawone (x); :($ y)) : 1.0 / rawone (x)
118+
109119function (:: Type{T} )(x:: Normed ) where {T <: AbstractFloat }
110- y = reinterpret (x)* (one (rawtype (x))/ convert (T, rawone (x)))
120+ # The following optimization for constant division may cause rounding errors.
121+ # y = reinterpret(x)*(one(rawtype(x))/convert(T, rawone(x)))
122+ # Therefore, we use a simple form here.
123+ # If you prefer speed over accuracy, consider using `scaledual` instead.
124+ y = reinterpret (x) / convert (promote_type (T, floattype (x)), rawone (x))
111125 convert (T, y) # needed for types like Float16 which promote arithmetic to Float32
112126end
127+
128+ function Base. Float16 (x:: Normed{Ti,f} ) where {Ti <: Union{UInt8, UInt16, UInt32} , f}
129+ f == 1 ? Float16 (x. i) : Float16 (Float32 (x))
130+ end
131+ function Base. Float16 (x:: Normed{Ti,f} ) where {Ti <: Union{UInt64, UInt128} , f}
132+ f == 1 ? Float16 (x. i) : Float16 (Float64 (x))
133+ end
134+
135+ function Base. Float32 (x:: Normed{UInt8,f} ) where f
136+ f == 1 && return Float32 (x. i)
137+ f == 2 && return Float32 (Int32 (x. i) * 0x101 ) * @f32 (0x550055 p- 32 )
138+ f == 3 && return Float32 (Int32 (x. i) * 0x00b ) * @f32 (0xd4c77b p- 30 )
139+ f == 4 && return Float32 (Int32 (x. i) * 0x101 ) * @f32 (0x110011 p- 32 )
140+ f == 5 && return Float32 (Int32 (x. i) * 0x003 ) * @f32 (0xb02c0b p- 30 )
141+ f == 6 && return Float32 (Int32 (x. i) * 0x049 ) * @f32 (0xe40039 p- 36 )
142+ f == 7 && return Float32 (Int32 (x. i) * 0x01f ) * @f32 (0x852b5f p- 35 )
143+ f == 8 && return Float32 (Int32 (x. i) * 0x155 ) * @f32 (0xc0f0fd p- 40 )
144+ 0.0f0
145+ end
146+ function Base. Float32 (x:: Normed{UInt16,f} ) where f
147+ f32 = Float32 (x. i)
148+ f == 1 && return f32
149+ f == 2 && return f32 * @f32 (0x55 p- 8 ) + f32 * @f32 (0x555555 p- 32 )
150+ f == 3 && return f32 * @f32 (0x49 p- 9 ) + f32 * @f32 (0x249249 p- 33 )
151+ f == 4 && return f32 * @f32 (0x11 p- 8 ) + f32 * @f32 (0x111111 p- 32 )
152+ f == 5 && return f32 * @f32 (0x21 p- 10 ) + f32 * @f32 (0x108421 p- 35 )
153+ f == 6 && return f32 * @f32 (0x41 p- 12 ) + f32 * @f32 (0x041041 p- 36 )
154+ f == 7 && return f32 * @f32 (0x81 p- 14 ) + f32 * @f32 (0x204081 p- 42 )
155+ f == 16 && return f32 * @f32 (0x01 p- 16 ) + f32 * @f32 (0x010001 p- 48 )
156+ Float32 (x. i / rawone (x))
157+ end
158+ function Base. Float32 (x:: Normed{UInt32,f} ) where f
159+ f == 1 && return Float32 (x. i)
160+ i32 = unsafe_trunc (Int32, x. i)
161+ if f == 32
162+ rh, rl = Float32 (i32>>> 16 ), Float32 ((i32& 0xFFFF )<< 8 | (i32>>> 24 ))
163+ return muladd (rh, @f32 (0x1 p- 16 ), rl * @f32 (0x1 p- 40 ))
164+ elseif f >= 25
165+ rh, rl = Float32 (i32>>> 16 ),Float32 (((i32& 0xFFFF )<< 14 ) + (i32>>> (f- 14 )))
166+ return muladd (rh, Float32 (@exp2 (16 - f)), rl * Float32 (@exp2 (- 14 - f)))
167+ end
168+ # FIXME : avoid the branch in native x86_64 (non-SIMD) codes
169+ m = ifelse (i32 < 0 , 0x1 p32 * inv_rawone (x), 0.0 )
170+ Float32 (muladd (Float64 (i32), inv_rawone (x), m))
171+ end
172+ function Base. Float32 (x:: Normed{Ti,f} ) where {Ti <: Union{UInt64, UInt128} , f}
173+ f == 1 ? Float32 (x. i) : Float32 (Float64 (x))
174+ end
175+
176+ function Base. Float64 (x:: Normed{Ti,f} ) where {Ti <: Union{UInt8, UInt16} , f}
177+ Float64 (Normed {UInt32,f} (x))
178+ end
179+ function Base. Float64 (x:: Normed{UInt32,f} ) where f
180+ f64 = Float64 (x. i)
181+ f == 1 && return f64
182+ f == 2 && return (f64 * 0x040001 ) * 0x15555000015555 p- 72
183+ f == 3 && return (f64 * 0x108421 ) * 0x11b6db76924929 p- 75
184+ f == 4 && return (f64 * 0x010101 ) * 0x11000011000011 p- 72
185+ f == 5 && return (f64 * 0x108421 ) * 0x04000002000001 p- 75
186+ f == 6 && return (f64 * 0x09dfb1 ) * 0x1a56b8e38e6d91 p- 78
187+ f == 7 && return (f64 * 0x000899 ) * 0x0f01480001e029 p- 70
188+ f == 8 && return (f64 * 0x0a5a5b ) * 0x18d300000018d3 p- 80
189+ f == 9 && return (f64 * 0x001001 ) * 0x080381c8e3f201 p- 72
190+ f == 10 && return (f64 * 0x100001 ) * 0x04010000000401 p- 80
191+ f == 11 && return (f64 * 0x000009 ) * 0x0e3aaae3955639 p- 66
192+ f == 12 && return (f64 * 0x0a8055 ) * 0x186246e46e4cfd p- 84
193+ f == 13 && return (f64 * 0x002001 ) * 0x10000004000001 p- 78
194+ f == 14 && return (f64 * 0x03400d ) * 0x13b13b14ec4ec5 p- 84
195+ f == 15 && return (f64 * 0x000259 ) * 0x06d0c5a4f3a5e9 p- 75
196+ f == 16 && return (f64 * 0x011111 ) * 0x00f000ff00fff1 p- 80
197+ f == 18 && return (f64 * 0x0b06d1 ) * 0x17377445dd1231 p- 90
198+ f == 19 && return (f64 * 0x080001 ) * 0x00004000000001 p- 76
199+ f == 20 && return (f64 * 0x000101 ) * 0x0ff010ef10ff01 p- 80
200+ f == 21 && return (f64 * 0x004001 ) * 0x01fff8101fc001 p- 84
201+ f == 22 && return (f64 * 0x002945 ) * 0x18d0000000018d p- 88
202+ f == 23 && return (f64 * 0x044819 ) * 0x07794a23729429 p- 92
203+ f == 27 && return (f64 * 0x000a21 ) * 0x0006518c7df9e1 p- 81
204+ f == 28 && return (f64 * 0x00000d ) * 0x13b13b14ec4ec5 p- 84
205+ f == 30 && return (f64 * 0x001041 ) * 0x00fc003f03ffc1 p- 90
206+ f == 32 && return (f64 * 0x010101 ) * 0x00ff0000ffff01 p- 96
207+ f64 / rawone (x)
208+ end
209+ function Base. Float64 (x:: Normed{UInt64,f} ) where f
210+ f == 1 && return Float64 (x. i)
211+ if f >= 53
212+ rh = Float64 (unsafe_trunc (Int64, x. i >> 16 )) * @exp2 (16 - f) # upper 48 bits
213+ rl = Float64 (unsafe_trunc (Int32, x. i& 0xFFFF )) * @exp2 (- f) # lower 16 bits
214+ return rh + muladd (rh, @exp2 (- f), rl)
215+ end
216+ x. i / rawone (x)
217+ end
218+ function Base. Float64 (x:: Normed{UInt128,f} ) where f
219+ f == 1 && return Float64 (x. i)
220+ ih, il = unsafe_trunc (Int64, x. i>> 64 ), unsafe_trunc (Int64, x. i)
221+ rh = Float64 (ih>>> 16 ) * @exp2 (f <= 53 ? 80 : 80 - f) # upper 48 bits
222+ km = @exp2 (f <= 53 ? 48 : 48 - f) # for middle 32 bits
223+ rm = Float64 (unsafe_trunc (Int32, ih& 0xFFFF )) * (0x1 p16 * km) +
224+ Float64 (unsafe_trunc (Int32, il>>> 48 )) * km
225+ rl = Float64 (il& 0xFFFFFFFFFFFF ) * @exp2 (f <= 53 ? 0 : - f) # lower 48 bits
226+ if f <= 53
227+ return (rh + (rm + rl)) / unsafe_trunc (Int64, rawone (x))
228+ elseif f < 76
229+ return rh + (rm + muladd (rh, @exp2 (- f), rl))
230+ else
231+ return rh + (rm + rl)
232+ end
233+ end
234+
235+ Base. BigFloat (x:: Normed ) = reinterpret (x)* (1 / BigFloat (rawone (x)))
236+
113237Base. Bool (x:: Normed ) = x == zero (x) ? false : true
114238Base. Integer (x:: Normed ) = convert (Integer, x* 1.0 )
115239(:: Type{T} )(x:: Normed ) where {T <: Integer } = convert (T, x* (1 / oneunit (T)))
0 commit comments