Skip to content

Vectorized isascii using simple loop 25+bytes/cycle for large strings #48568

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions base/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,38 @@ isascii(c::Char) = bswap(reinterpret(UInt32, c)) < 0x80
# Generic fallback: a string is ASCII when every one of its characters is ASCII.
# Stops at the first character that fails the check.
function isascii(s::AbstractString)
    for ch in s
        isascii(ch) || return false
    end
    return true
end
# Generic character fallback: convert `c` to its `UInt32` value and compare it
# against the first non-ASCII value, 0x80.
function isascii(c::AbstractChar)
    cp = UInt32(c)
    return cp < 0x80
end

# OR together the elements of `code_units[first:last]` and report whether the combined
# value lies in the ASCII range. Any element with a bit at or above 0x80 set (or, for a
# signed `CU`, any negative element) pushes the combined value out of that range.
# An empty range (`last < first`) yields `true`.
# Indexing is `@inbounds`: callers must pass indices that are valid for `code_units`.
@inline function _isascii(code_units::AbstractVector{CU}, first, last) where {CU}
    acc = zero(CU)
    for idx in first:last
        @inbounds acc |= code_units[idx]
    end
    return 0 <= acc < 0x80
end

# Check `cu[first:last]` for ASCII-only content one `chunk_size`-element chunk at a time,
# returning `false` as soon as a chunk fails.
# The chunking algorithm makes the last two chunks overlap in order to keep the size fixed:
# the final chunk always covers exactly the last `chunk_size` elements, even when that
# re-checks elements already covered by the previous chunk.
@inline function _isascii_chunks(chunk_size, cu::AbstractVector{CU}, first, last) where {CU}
    # A range shorter than one chunk leaves no room for the fixed-size tail chunk: its
    # start (`last - chunk_size + 1`) would fall before `first`, and `_isascii` indexes
    # without bounds checks. Check such a range directly instead.
    last - first + 1 < chunk_size && return _isascii(cu, first, last)
    n = first
    while n <= last - chunk_size
        _isascii(cu, n, n + chunk_size - 1) || return false
        n += chunk_size
    end
    # Tail: exactly `chunk_size` elements ending at `last`.
    return _isascii(cu, last - chunk_size + 1, last)
end
"""
isascii(cu::AbstractVector{CU}) where {CU <: Integer} -> Bool

Test whether all values in the vector belong to the ASCII character set (0x00 to 0x7f).
This function is intended to be used by other string implementations that need a fast ASCII check.
"""
function isascii(cu::AbstractVector{CU}) where {CU <: Integer}
chunk_size = 1024
chunk_threshold = chunk_size + (chunk_size ÷ 2)
first = firstindex(cu); last = lastindex(cu)
l = last - first + 1
l < chunk_threshold && return _isascii(cu,first,last)
return _isascii_chunks(chunk_size,cu,first,last)
end

## string map, filter ##

function map(f, s::AbstractString)
Expand Down
7 changes: 1 addition & 6 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -326,12 +326,7 @@ end

# An index is valid when it is within the bounds of `s` and `thisind` maps it to itself.
function isvalid(s::String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end

# Scan the code units of `s` in order and stop at the first one that is 0x80 or above.
# Returns `true` when no such code unit exists (including for the empty string).
function isascii(s::String)
    nunits = ncodeunits(s)
    pos = 1
    @inbounds while pos <= nunits
        codeunit(s, pos) < 0x80 || return false
        pos += 1
    end
    return true
end
# `String` forwards to the `isascii` method that takes the string's code units.
function isascii(s::String)
    units = codeunits(s)
    return isascii(units)
end

"""
repeat(c::AbstractChar, r::Integer) -> String
Expand Down
2 changes: 2 additions & 0 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ function getindex(s::SubString, i::Integer)
@inbounds return getindex(s.string, s.offset + i)
end

# `SubString{String}` forwards to the `isascii` method that takes the code units.
function isascii(ss::SubString{String})
    units = codeunits(ss)
    return isascii(units)
end

function isvalid(s::SubString, i::Integer)
ib = true
@boundscheck ib = checkbounds(Bool, s, i)
Expand Down
26 changes: 26 additions & 0 deletions test/strings/basic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,32 @@ end
@test sprint(summary, "") == "empty String"
end

@testset "isascii" begin
    N = 1
    @test isascii("S"^N) == true
    @test isascii("S"^(N - 1)) == true
    @test isascii("S"^(N + 1)) == true

    @test isascii("λ" * ("S"^(N))) == false
    @test isascii(("S"^(N)) * "λ") == false

    # Empty input through the code-unit vector entry point.
    @test isascii(codeunits("")) == true

    # Powers of two and their neighbours, from 2 up to 2^16 code units, cover both short
    # inputs and inputs long enough to span many 1024-element chunks.
    for p = 1:16
        N = 2^p
        @test isascii("S"^N) == true
        @test isascii("S"^(N - 1)) == true
        @test isascii("S"^(N + 1)) == true

        @test isascii("λ" * ("S"^(N))) == false
        @test isascii(("S"^(N)) * "λ") == false
        @test isascii("λ"*("S"^(N - 1))) == false
        @test isascii(("S"^(N - 1)) * "λ") == false
        if N > 4
            @test isascii("λ" * ("S"^(N - 3))) == false
            @test isascii(("S"^(N - 3)) * "λ") == false
        end

        # Non-ASCII character in the interior: for long inputs it is in neither the
        # first chunk nor the final (overlapping) chunk.
        half = "S"^(N ÷ 2)
        @test isascii(half * "λ" * half) == false

        # Code-unit vector entry point.
        @test isascii(codeunits("S"^N)) == true
        @test isascii(codeunits(half * "λ" * half)) == false

        # SubString entry point. "λ" occupies two code units, so the ASCII run of
        # `padded` spans indices 3:(N + 2) and the trailing "λ" starts at N + 3.
        padded = "λ" * ("S"^N) * "λ"
        @test isascii(SubString(padded, 3, N + 2)) == true
        @test isascii(SubString(padded, 1, N + 2)) == false
        @test isascii(SubString(padded, 3, N + 3)) == false
    end
end

@testset "Plug holes in test coverage" begin
@test_throws MethodError checkbounds(Bool, "abc", [1.0, 2.0])

Expand Down