Closed
Description
Below are three functions using `@avxt`, `@avx`, and `@inbounds @simd` (the last one also applies `@fastmath` in the code below). In my practical application of this function, `@avx` and `@avxt` often both turn out slower than `@inbounds @simd`, even though `@avxt` clearly dominates the benchmark below. In the benchmark, `@avx` is consistently slower than the other alternatives. I'm running this on an Intel Core i7-10510U CPU @ 1.80GHz with Julia v1.6-rc1.
"""
    fun_avxt(D, QT, μ, σ, m, i)

Overwrite `D` in place using an `@avxt` loop and return `D`.

For every index `j` of `D` the loop computes

    frac = (QT[j] - m*μ[i]*μ[j]) / (m*σ[i]*σ[j])
    D[j] = sqrt(max(2m*(1 - frac), 0))

and afterwards `D[i]` is set to `typemax(eltype(D))`.

# Arguments
- `D::AbstractVector{S}`: output vector, mutated in place.
- `QT::AbstractVector{S}`: input vector read at every index of `D`.
- `μ`, `σ`: indexable collections read at `i` and at every index of `D`.
- `m::Int`: integer scale factor.
- `i::Int`: index into `D` whose entry is replaced by `typemax(eltype(D))`.

# Throws
- `BoundsError` if `i` is not a valid index of `D`, or if any index of `D` is out of
  bounds for `QT`, `μ` or `σ`.
"""
function fun_avxt(D::AbstractVector{S}, QT::AbstractVector{S}, μ, σ, m::Int, i::Int) where S <: Number
    # Validate with real exceptions instead of `@assert`, which is meant for internal
    # invariants and may be compiled out.
    checkbounds(D, i)
    # The loop below only iterates over the indices of `D`, and the vectorized loop is
    # not expected to bounds-check the other arrays, so verify them once up front.
    inds = eachindex(D)
    checkbounds(QT, inds)
    checkbounds(μ, inds)
    checkbounds(σ, inds)

    mμ = m*μ[i]
    mσ = m*σ[i]
    O = zero(S)
    @avxt for j = eachindex(D)
        frac = (QT[j] - mμ*μ[j]) / (mσ*σ[j])
        D[j] = sqrt(max(2m*(1-frac), O))
    end
    # Replace entry `i` with the largest value of the element type.
    D[i] = typemax(eltype(D))
    return D
end
"""
    fun_avx(D, QT, μ, σ, m, i)

Overwrite `D` in place using an `@avx` loop and return `D`.

For every index `j` of `D` the loop computes

    frac = (QT[j] - m*μ[i]*μ[j]) / (m*σ[i]*σ[j])
    D[j] = sqrt(max(2m*(1 - frac), 0))

and afterwards `D[i]` is set to `typemax(eltype(D))`.

# Arguments
- `D::AbstractVector{S}`: output vector, mutated in place.
- `QT::AbstractVector{S}`: input vector read at every index of `D`.
- `μ`, `σ`: indexable collections read at `i` and at every index of `D`.
- `m::Int`: integer scale factor.
- `i::Int`: index into `D` whose entry is replaced by `typemax(eltype(D))`.

# Throws
- `BoundsError` if `i` is not a valid index of `D`, or if any index of `D` is out of
  bounds for `QT`, `μ` or `σ`.
"""
function fun_avx(D::AbstractVector{S}, QT::AbstractVector{S}, μ, σ, m::Int, i::Int) where S <: Number
    # Validate with real exceptions instead of `@assert`, which is meant for internal
    # invariants and may be compiled out.
    checkbounds(D, i)
    # The loop below only iterates over the indices of `D`, and the vectorized loop is
    # not expected to bounds-check the other arrays, so verify them once up front.
    inds = eachindex(D)
    checkbounds(QT, inds)
    checkbounds(μ, inds)
    checkbounds(σ, inds)

    mμ = m*μ[i]
    mσ = m*σ[i]
    O = zero(S)
    @avx for j = eachindex(D)
        frac = (QT[j] - mμ*μ[j]) / (mσ*σ[j])
        D[j] = sqrt(max(2m*(1-frac), O))
    end
    # Replace entry `i` with the largest value of the element type.
    D[i] = typemax(eltype(D))
    return D
end
"""
    fun_simd(D, QT, μ, σ, m, i)

Overwrite `D` in place using an `@inbounds @fastmath @simd` loop and return `D`.

For every index `j` of `D` the loop computes

    frac = (QT[j] - m*μ[i]*μ[j]) / (m*σ[i]*σ[j])
    D[j] = sqrt(max(2m*(1 - frac), 0))

and afterwards `D[i]` is set to `typemax(eltype(D))`. Because the loop runs under
`@fastmath`, results may deviate slightly from strict IEEE arithmetic.

# Arguments
- `D::AbstractVector{S}`: output vector, mutated in place.
- `QT::AbstractVector{S}`: input vector read at every index of `D`.
- `μ`, `σ`: indexable collections read at `i` and at every index of `D`.
- `m::Int`: integer scale factor.
- `i::Int`: index into `D` whose entry is replaced by `typemax(eltype(D))`.

# Throws
- `BoundsError` if `i` is not a valid index of `D`, or if any index of `D` is out of
  bounds for `QT`, `μ` or `σ`.
"""
function fun_simd(D::AbstractVector{S}, QT::AbstractVector{S}, μ, σ, m::Int, i::Int) where S <: Number
    # Validate with real exceptions instead of `@assert`, which is meant for internal
    # invariants and may be compiled out.
    checkbounds(D, i)
    # `@inbounds` removes the per-element checks on `QT`, `μ` and `σ`, while the loop only
    # iterates over the indices of `D`; verify once up front that those indices are valid
    # for every array that is read.
    inds = eachindex(D)
    checkbounds(QT, inds)
    checkbounds(μ, inds)
    checkbounds(σ, inds)

    mμ = m*μ[i]
    mσ = m*σ[i]
    O = zero(S)
    @inbounds @fastmath @simd for j in inds
        frac = (QT[j] - mμ*μ[j]) / (mσ*σ[j])
        D[j] = sqrt(max(2m*(1-frac), O))
    end
    # Replace entry `i` with the largest value of the element type.
    D[i] = typemax(eltype(D))
    return D
end
# Benchmark setup, Float64 case: four vectors of 10000 zeros.
# NOTE(review): with all-zero inputs each kernel evaluates
# frac = (0 - 0*0) / (0*0) = 0/0 for every element, so these timings exercise the NaN
# path only — consider non-zero (e.g. random) inputs for a representative measurement.
D = zeros(10000)
QT = zeros(10000)
μ = zeros(10000)
σ = zeros(10000)
m = 10
i = 1
# `@btime` is presumably BenchmarkTools.@btime (no `using` line is visible in this snippet).
# Arguments are `$`-interpolated; the trailing comments are the reporter's recorded timings.
# Each function overwrites `D` in place, so repeated samples run on already-written output.
@btime fun_avxt($D,$QT,$μ,$σ,$m,$i) # 2.691 μs (0 allocations: 0 bytes)
@btime fun_avx($D,$QT,$μ,$σ,$m,$i) # 10.679 μs (0 allocations: 0 bytes)
@btime fun_simd($D,$QT,$μ,$σ,$m,$i) # 9.927 μs (0 allocations: 0 bytes)
# Same benchmark repeated with Float32 vectors (again all zeros); `m` and `i` are unchanged.
D = zeros(Float32, 10000)
QT = zeros(Float32, 10000)
μ = zeros(Float32, 10000)
σ = zeros(Float32, 10000)
m = 10
i = 1
@btime fun_avxt($D,$QT,$μ,$σ,$m,$i) # 1.095 μs (0 allocations: 0 bytes)
@btime fun_avx($D,$QT,$μ,$σ,$m,$i) # 3.218 μs (0 allocations: 0 bytes)
@btime fun_simd($D,$QT,$μ,$σ,$m,$i) # 2.054 μs (0 allocations: 0 bytes)
Metadata
Metadata
Assignees
Labels
No labels