Skip to content

Commit b2dac5c

Browse files
committed
Add optimized methods for UTF-8 strings
1 parent 53c88a4 commit b2dac5c

File tree

3 files changed

+226
-0
lines changed

3 files changed

+226
-0
lines changed

src/TextParse.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ include("util.jl")
77
include("field.jl")
88
include("record.jl")
99

10+
include("utf8optimizations.jl")
11+
1012
include("guesstype.jl")
1113
include("csv.jl")
1214

src/utf8optimizations.jl

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
    eatwhitespaces(str::String, i=1, len=lastindex(str))

Skip the run of ASCII space bytes (`0x20`) in `str` that begins at code unit index `i`,
reading no further than index `len`.

Return the index of the first code unit that was not skipped: either the first byte
that is not a space, or the first index greater than `len`. If `i > len` on entry,
`i` is returned unchanged. Only `0x20` is treated as whitespace by this method.
"""
@inline function eatwhitespaces(str::String, i=1, len=lastindex(str))
    # A space is a single byte in UTF-8, so comparing raw code units is sufficient
    # and avoids decoding characters.
    while i <= len && @inbounds(codeunit(str, i)) == 0x20
        i += 1
    end
    return i
end
13+
14+
"""
    eatnewlines(str::String, i=1, len=lastindex(str))

Skip consecutive line breaks in `str` starting at code unit index `i`, reading no
further than index `len`.

A line break is a CR byte (`0x0d`) or an LF byte (`0x0a`), optionally followed
immediately by the other of the two; such a pair counts as one line break.

Return `(next, count)`, where `next` is the index of the first code unit that was not
consumed and `count` is the number of line breaks skipped.
"""
@inline function eatnewlines(str::String, i=1, len=lastindex(str))
    nbreaks = 0
    while i <= len
        @inbounds lead = codeunit(str, i)
        # `mate` is the byte that, directly after `lead`, completes a two-byte break.
        if lead == 0x0d        # CR, may be followed by LF
            mate = 0x0a
        elseif lead == 0x0a    # LF, may be followed by CR
            mate = 0x0d
        else
            break
        end
        i += 1
        if i <= len && @inbounds(codeunit(str, i)) == mate
            i += 1
        end
        nbreaks += 1
    end

    return i, nbreaks
end
43+
44+
"""
    tryparsenext_base10_digit(T, str::String, i, len)

Try to read a single ASCII decimal digit from `str` at code unit index `i`.

Return `(convert(T, digit), i + 1)` when `i <= len` and the byte at `i` lies in
`0x30:0x39` ('0' to '9'); return `nothing` otherwise.
"""
@inline function tryparsenext_base10_digit(T, str::String, i, len)
    i > len && return nothing
    @inbounds b = codeunit(str, i)
    # Non-short-circuiting `&` on the two range comparisons.
    ((0x30 <= b) & (b <= 0x39)) || return nothing
    return convert(T, b - 0x30), i + 1
end
53+
54+
# Return `true` when byte `b` is an ASCII decimal digit, i.e. lies in 0x30:0x39
# ('0' to '9'). Both comparisons are always evaluated (`&`, not `&&`).
@inline _isdigit(b::UInt8) = (0x30 <= b) & (b <= 0x39)
55+
56+
"""
    parse_uint_and_stop(str::String, i, len, n::T) where {T<:Integer}

Append consecutive ASCII decimal digits of `str`, starting at code unit index `i` and
reading no further than index `len`, to the running value `n`: every digit `d`
consumed replaces `n` by `10n + d`.

Return `(n, ok, next)`:
- `n` is the updated accumulator;
- `ok` is `false` when no digit was consumed (because `i > len`, the byte at `i` is not
  a digit, or `n` was already above the overflow guard on entry) and `true` otherwise;
- `next` is the index of the first code unit that was not consumed.

Reading stops at the first non-digit byte, once `len` is passed, or as soon as `n`
exceeds `div(typemax(T) - 9, 10)`. In the last case any remaining digits are left
unread for the caller to deal with.
"""
@inline function parse_uint_and_stop(str::String, i, len, n::T) where {T <: Integer}
    ten = T(10)
    # The largest accumulator value for which `n * 10 + 9` still fits in `T`.
    max_without_overflow = div(typemax(T) - 9, 10)

    # Specialize handling of the first digit so that "no digit consumed" can be
    # reported to the caller through the second return value.
    i <= len || return n, false, i
    @inbounds b = codeunit(str, i)
    if _isdigit(b) && n <= max_without_overflow
        n *= ten
        n += T(b - 0x30)
    else
        return n, false, i
    end
    i += 1

    while i <= len && n <= max_without_overflow
        @inbounds b = codeunit(str, i)
        _isdigit(b) || break
        n *= ten
        n += T(b - 0x30)
        i += 1
    end
    return n, true, i
end
82+
83+
"""
    read_digits(str::String, i, len)

Skip over ASCII decimal digits in `str` starting at code unit index `i`, reading no
further than index `len`, without using their values.

Return the index of the first code unit that is not a digit, or the first index
greater than `len` when digits run up to `len`.
"""
@inline function read_digits(str::String, i, len)
    # Slurp up extra digits; their values are deliberately ignored.
    while i <= len && _isdigit(@inbounds(codeunit(str, i)))
        i += 1
    end
    return i
end
94+
95+
# Return `true` when the code unit of `str` at index `i` is 'e' (0x65) or 'E' (0x45).
# The read is marked `@inbounds`, so the caller must pass a valid code unit index.
@inline function _is_e(str::String, i)
    @inbounds b = codeunit(str, i)
    return (b == 0x65) | (b == 0x45)
end
99+
100+
# Return `true` when the code unit of `str` at index `i` is '-' (0x2d).
# The read is marked `@inbounds`, so the caller must pass a valid code unit index.
@inline function _is_negative(str::String, i)
    @inbounds b = codeunit(str, i)
    return b == 0x2d
end
104+
105+
# Return `true` when the code unit of `str` at index `i` is '+' (0x2b).
# The read is marked `@inbounds`, so the caller must pass a valid code unit index.
@inline function _is_positive(str::String, i)
    @inbounds b = codeunit(str, i)
    return b == 0x2b
end
109+
110+
# TODO Generally handle types other than Float64 properly
111+
"""
    tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}

Byte-oriented float parser for `String` input. Starting at code unit index `i` and
reading no further than index `len`, it accepts, in order:

1. an optional sign (`+` or `-`),
2. integer digits,
3. an optional `.` followed by fraction digits,
4. an optional exponent: `e` or `E`, an optional sign, and digits.

At least one digit must be present in the integer or the fraction part.

Return `(Nullable{F}(value), next)` on success, where `next` is the index of the first
code unit that was not consumed, and `(Nullable{F}(), i)` on failure, where `i` is the
position reached when parsing was abandoned.
"""
@inline function tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}
    R = Nullable{F}

    i > len && @goto error

    negate = false
    @inbounds b = codeunit(str, i)
    if b == 0x2d # '-'
        negate = true
        i += 1
    elseif b == 0x2b # '+'
        i += 1
    end

    # TODO Pick this type based on what floating point type we are asked for
    f1::Int64 = 0

    # Read the integer part into `f1`. `parse_uint_and_stop` stops early once `f1`
    # cannot take another digit; `read_digits` then skips the remaining integer
    # digits. Each skipped digit is a factor of ten that must be put back into the
    # decimal exponent, otherwise long integers would be silently truncated.
    f1, rval1, istop = parse_uint_and_stop(str, i, len, f1)
    idecpt = read_digits(str, istop, len)
    dropped_int_digits = idecpt - istop
    i = idecpt

    ie = i
    frac_digits = 0
    have_digits = rval1

    # Next thing must be the decimal point.
    if i <= len && @inbounds(codeunit(str, i)) == 0x2e # Check for '.'
        i += 1
        f1, rval2, ie = parse_uint_and_stop(str, i, len, f1)
        # Only fraction digits actually accumulated into `f1` shift the exponent.
        frac_digits = ie - i
        have_digits |= rval2

        # Skip fraction digits that no longer fit; they only affect precision.
        ie = read_digits(str, ie, len)
    end

    # No digit before or after the (optional) decimal point => invalid. When `rval1`
    # is false `f1` is still zero, so a false `rval2` really means "no digit".
    have_digits || @goto error

    # Next thing must be the exponent.
    i = ie
    expval::Int32 = 0

    if i <= len && _is_e(str, i)
        i += 1

        enegate = false
        if i <= len
            if _is_negative(str, i)
                enegate = true
                i += 1
            elseif _is_positive(str, i)
                i += 1
            end
        end
        # NOTE(review): an exponent marker without digits (e.g. "1e") is accepted and
        # the marker is consumed, as before; confirm this is the intended behavior.
        expval, _, i = parse_uint_and_stop(str, i, len, expval)
        if enegate
            expval *= Int32(-1)
        end
    end

    # Effective power of ten to apply to the integer significand `f1`.
    exp10 = expval - frac_digits + dropped_int_digits

    if frac_digits <= 15 && -22 <= exp10 <= 22
        # Fast path: scale by a single multiplication or division.
        if exp10 >= 0
            f = F(f1) * 10.0^exp10
        else
            f = F(f1) / 10.0^(-exp10)
        end
    else
        f = convert_to_double(f1, exp10)
    end

    if negate
        f = -f
    end

    return R(convert(F, f)), i

    @label error
    return R(), i
end

test/runtests.jl

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,21 @@ import TextParse: eatnewlines
1313
@testset "eatnewlines" begin
    # `String` arguments dispatch to the UTF-8 optimized method.
    @test eatnewlines("\n\r\nx") == (4, 2)
    @test eatnewlines("x\n\r\nx") == (1, 0)

    # Also test the AbstractString variant
    @test eatnewlines(SubString("\n\r\nx", 1)) == (4, 2)
    @test eatnewlines(SubString("x\n\r\nx", 1)) == (1, 0)
end
1721

22+
import TextParse: eatwhitespaces
23+
@testset "eatwhitespaces" begin
    # `String` arguments dispatch to the UTF-8 optimized method.
    # Two leading spaces: the first non-space byte is at index 3.
    @test eatwhitespaces("  x") == 3
    @test eatwhitespaces("x x") == 1

    # Also test the AbstractString variant
    @test eatwhitespaces(SubString("  x", 1)) == 3
    @test eatwhitespaces(SubString("x x", 1)) == 1
end
1831

1932
import TextParse: getlineend
2033
@testset "getlineend" begin
@@ -41,10 +54,28 @@ import TextParse: fromtype, Percentage
4154
@test tryparsenext(Percentage(), "33%") |> unwrap == (.33,4)
4255
@test tryparsenext(Percentage(), "3.3%") |> unwrap == (.033,5)
4356

57+
# Also test AbstractString variant
58+
@test tryparsenext(fromtype(Float64), SubString("1", 1), 1, 1) |> unwrap == (1.0, 2)
59+
@test tryparsenext(fromtype(Float64), SubString("12", 1), 1, 2) |> unwrap == (12.0, 3)
60+
@test tryparsenext(fromtype(Float64), SubString(".1", 1), 1, 2) |> unwrap == (0.1, 3)
61+
@test tryparsenext(fromtype(Float64), SubString("1.1", 1), 1, 3) |> unwrap == (1.1, 4)
62+
@test tryparsenext(fromtype(Float32), SubString("1.", 1), 1, 2) |> unwrap == (1f0,3)
63+
@test tryparsenext(fromtype(Float64), SubString("-1.1", 1), 1, 4) |> unwrap == (-1.1,5)
64+
@test tryparsenext(fromtype(Float64), SubString("-1.0e-12", 1), 1, 8) |> unwrap == (-1.0e-12,9)
65+
@test tryparsenext(fromtype(Float64), SubString("-1e-12", 1)) |> unwrap == (-1.0e-12,7)
66+
@test tryparsenext(fromtype(Float64), SubString("-1.0E-12", 1), 1, 8) |> unwrap == (-1.0e-12,9)
67+
@test tryparsenext(fromtype(Float64), SubString("5.e-3", 1), 1, 5) |> unwrap == (5.0e-3,6) # 32
68+
@test tryparsenext(Percentage(), SubString("33%", 1)) |> unwrap == (.33,4)
69+
@test tryparsenext(Percentage(), SubString("3.3%", 1)) |> unwrap == (.033,5)
70+
4471
rng = MersenneTwister(0)
4572
floats = rand(1_000)
4673
parsed_floats = map(i->get(tryparsenext(fromtype(Float64), i, 1, lastindex(i))[1]), string.(floats))
4774
@test parsed_floats == floats
75+
76+
# Also test AbstractString variant
77+
parsed_floats = map(i->get(tryparsenext(fromtype(Float64), SubString(i,1), 1, lastindex(i))[1]), string.(floats))
78+
@test parsed_floats == floats
4879
end
4980

5081

0 commit comments

Comments
 (0)