Skip to content

Commit 0e5f508

Browse files
committed
Add optimized methods for UTF-8 strings
1 parent 53c88a4 commit 0e5f508

File tree

2 files changed

+195
-0
lines changed

2 files changed

+195
-0
lines changed

src/TextParse.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ include("util.jl")
77
include("field.jl")
88
include("record.jl")
99

10+
include("utf8optimizations.jl")
11+
1012
include("guesstype.jl")
1113
include("csv.jl")
1214

src/utf8optimizations.jl

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
    eatwhitespaces(str::String, i=1, len=lastindex(str))

Skip over a run of ASCII space bytes (0x20) in `str`, starting at code unit
index `i` and looking no further than index `len`.

Return the index of the first code unit that is not a space, or `len + 1` when
every code unit in `i:len` is a space. Only the space byte is skipped; other
whitespace characters stop the scan.
"""
@inline function eatwhitespaces(str::String, i=1, len=lastindex(str))
    # The bounds test short-circuits before the unchecked code unit read.
    while i <= len && @inbounds(codeunit(str, i)) == 0x20
        i += 1
    end
    return i
end
13+
14+
"""
    eatnewlines(str::String, i=1, len=lastindex(str))

Skip over consecutive line breaks in `str`, starting at code unit index `i` and
looking no further than index `len`.

A line break is a CR byte (0x0d) or an LF byte (0x0a), optionally followed
immediately by the other one of the two, so CR LF and LF CR each count once.

Return `(next, count)`: the index of the first code unit after the consumed
line breaks, and the number of line breaks consumed.
"""
@inline function eatnewlines(str::String, i=1, len=lastindex(str))
    nbreaks = 0
    while i <= len
        @inbounds lead = codeunit(str, i)
        # Decide which byte, if it comes next, belongs to the same line break.
        if lead == 0x0d
            partner = 0x0a
        elseif lead == 0x0a
            partner = 0x0d
        else
            break
        end
        i += 1
        if i <= len && @inbounds(codeunit(str, i)) == partner
            i += 1
        end
        nbreaks += 1
    end

    return i, nbreaks
end
43+
44+
"""
    tryparsenext_base10_digit(T, str::String, i, len)

Read a single ASCII decimal digit from code unit `i` of `str`.

Return `(convert(T, digit), i + 1)` when `i <= len` and the code unit at `i` is
in the range 0x30 ('0') through 0x39 ('9'); return `nothing` otherwise.
"""
@inline function tryparsenext_base10_digit(T, str::String, i, len)
    i > len && return nothing
    @inbounds b = codeunit(str, i)
    # ASCII comparison operators are used on purpose: they survive any
    # re-encoding of the file, unlike their Unicode look-alikes.
    ((0x30 <= b) & (b <= 0x39)) || return nothing
    return convert(T, b - 0x30), i + 1
end
53+
54+
# Return `true` when byte `b` is an ASCII decimal digit, i.e. lies in the range
# 0x30 ('0') through 0x39 ('9'). The non-short-circuiting `&` keeps the check
# branch-free.
@inline _isdigit(b::UInt8) = ((0x30 <= b) & (b <= 0x39))
55+
56+
"""
    parse_uint_and_stop(str::String, i, len, n::T) where {T <: Integer}

Append the decimal digits found at code unit `i` onward in `str` (reading no
further than `len`) to the accumulator `n`, stopping at the first non-digit or
as soon as another digit might overflow `T`.

Return `(n, ok, next)`: the updated accumulator, whether at least one digit was
consumed, and the index of the first code unit that was not consumed. Digits
left unread because the accumulator is full are not consumed.
"""
@inline function parse_uint_and_stop(str::String, i, len, n::T) where {T <: Integer}
    base = T(10)
    # The largest accumulator value that can still take one more digit (up to 9)
    # without exceeding typemax(T).
    limit = div(typemax(T) - 9, 10)

    # The first digit is handled separately so that "no digit at all" can be
    # reported to the caller through the `false` flag.
    if i > len
        return n, false, i
    end
    @inbounds b = codeunit(str, i)
    if !(_isdigit(b) && n <= limit)
        return n, false, i
    end
    n = n * base + T(b - 0x30)
    i += 1

    while i <= len && n <= limit
        @inbounds b = codeunit(str, i)
        _isdigit(b) || break
        n = n * base + T(b - 0x30)
        i += 1
    end
    return n, true, i
end
82+
83+
"""
    read_digits(str::String, i, len)

Skip over a run of ASCII decimal digits in `str`, starting at code unit index
`i` and looking no further than `len`, without accumulating their value.

Return the index of the first code unit that is not a digit, or `len + 1` when
every code unit in `i:len` is a digit.
"""
@inline function read_digits(str::String, i, len)
    # The bounds test short-circuits before the unchecked code unit read.
    while i <= len && _isdigit(@inbounds(codeunit(str, i)))
        i += 1
    end
    return i
end
94+
95+
"""
    _is_e(str::String, i)

Return `true` when the code unit at index `i` of `str` is `e` (0x65) or `E`
(0x45). The read is not bounds-checked, so the caller must ensure `i` is valid.
"""
@inline function _is_e(str::String, i)
    @inbounds unit = codeunit(str, i)
    # Setting bit 0x20 maps 'E' onto 'e' and leaves 'e' unchanged; no other byte
    # becomes 0x65 this way.
    return (unit | 0x20) == 0x65
end
99+
100+
"""
    _is_negative(str::String, i)

Return `true` when the code unit at index `i` of `str` is the minus sign `-`
(0x2d). The read is not bounds-checked, so the caller must ensure `i` is valid.
"""
@inline function _is_negative(str::String, i)
    return @inbounds(codeunit(str, i)) == 0x2d
end
104+
105+
"""
    _is_positive(str::String, i)

Return `true` when the code unit at index `i` of `str` is the plus sign `+`
(0x2b). The read is not bounds-checked, so the caller must ensure `i` is valid.
"""
@inline function _is_positive(str::String, i)
    return @inbounds(codeunit(str, i)) == 0x2b
end
109+
110+
# TODO Generally handle types other than Float64 properly
111+
"""
    tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}

Parse a decimal floating point literal from the code units of `str`, starting at
index `i` and reading no further than index `len`.

The accepted form is: an optional `+`/`-` sign, decimal digits, an optional `.`
followed by decimal digits, and an optional `e`/`E` exponent with its own
optional sign. At least one digit must appear before or after the decimal point.

Return `(Nullable{F}(value), next)` on success, where `next` is the index of the
first code unit that was not consumed. Return `(Nullable{F}(), i)` on failure,
where `i` is the index reached when the failure was detected.

The mantissa is accumulated in an `Int64`; digits beyond its capacity are
skipped, which limits precision but not magnitude.
"""
@inline function tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}
    R = Nullable{F}

    i > len && @goto error

    negate = false
    @inbounds b = codeunit(str, i)
    if b == 0x2d # '-'
        negate = true
        i += 1
    elseif b == 0x2b # '+'
        i += 1
    end

    # TODO Pick this type based on what floating point type we are asked for
    f1::Int64 = 0

    # Read the integer part up to the decimal point. parse_uint_and_stop stops
    # early once the accumulator cannot safely take another digit.
    f1, rval1, istop = parse_uint_and_stop(str, i, len, f1)
    idecpt = read_digits(str, istop, len) # skip integer digits that did not fit
    # Each skipped integer digit is a factor of ten missing from f1; remember
    # how many there were so the exponent can compensate below.
    dropped_digits = idecpt - istop
    i = idecpt

    ie = i
    frac_digits = 0

    # next thing must be dec pt.
    if i <= len && @inbounds(codeunit(str, i)) == 0x2e # Check for '.'
        i += 1
        f1, rval2, ie = parse_uint_and_stop(str, i, len, f1)
        frac_digits = ie - i

        # A decimal point with no digit on either side is not a number.
        (rval1 | rval2) || @goto error

        ie = read_digits(str, ie, len) # skip fraction digits that did not fit
    elseif !rval1 # no first number, and now no decimal point => invalid
        @goto error
    end

    # Next thing must be exponent
    i = ie
    power::Int32 = 0

    if i <= len && _is_e(str, i)
        i += 1

        enegate = false
        if i <= len
            if _is_negative(str, i)
                enegate = true
                i += 1
            elseif _is_positive(str, i)
                i += 1
            end
        end
        power, rval3, i = parse_uint_and_stop(str, i, len, power)
        if enegate
            power *= Int32(-1)
        end
    end

    # Power of ten that f1 must be multiplied by: the written exponent, minus
    # the fraction digits folded into f1, plus the integer digits left out.
    scale = power - frac_digits + dropped_digits

    # NOTE(review): the fast path below is guarded by the fraction digit count
    # rather than by the magnitude of f1 - confirm that is the intended bound.
    if frac_digits <= 15 && -22 <= scale <= 22
        if scale >= 0
            f = F(f1)*10.0^scale
        else
            f = F(f1)/10.0^(-scale)
        end
    else
        f = convert_to_double(f1, scale)
    end

    if negate
        f = -f
    end

    return R(convert(F, f)), i

    @label error
    return R(), i
end

0 commit comments

Comments
 (0)