Skip to content

Fix performance regressions and float parsing #74

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
*.mem
docs/build/
docs/site/
benchmark/tune.json
2 changes: 2 additions & 0 deletions REQUIRE
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ Nullables
DataStructures
WeakRefStrings 0.4.4
CodecZlib
Nullables
DoubleFloats
75 changes: 75 additions & 0 deletions benchmark/benchmarks.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
using BenchmarkTools
using TextParse
VERSION >= v"0.7" && using Dates

# Root benchmark group; every benchmark defined in this file is registered under it.
const SUITE = BenchmarkGroup()

# Sub-group for the low-level parsing helpers, tagged "string" and "unicode".
SUITE["util"] = BenchmarkGroup(["string", "unicode"])

# Return the last valid index of `x`.
# Uses `lastindex` on Julia 0.7 and newer, and falls back to `endof` on older versions.
function our_lastindex(x)
    if VERSION >= v"0.7"
        return lastindex(x)
    else
        return endof(x)
    end
end

# Float inputs (unsigned and negative), each paired with its last valid index.
float64str = "456.254"
float64strlen = our_lastindex(float64str)
negfloat64str = "-456.254"
negfloat64strlen = our_lastindex(negfloat64str)

# Integer inputs (unsigned and negative), each paired with its last valid index.
intstr = "9823345"
intstrlen = our_lastindex(intstr)
negintstr = "-9823345"
negintstrlen = our_lastindex(negintstr)

# Benchmarks for `TextParse.tryparsenext_base10_digit` on a float string and an int string.
# The literal 3 is presumably the start index into the input — TODO confirm against the helper.
SUITE["util"]["tryparsenext_base10_digit"] = BenchmarkGroup()
SUITE["util"]["tryparsenext_base10_digit"]["Float64"] = @benchmarkable TextParse.tryparsenext_base10_digit(Float64, $float64str, 3, $float64strlen)
SUITE["util"]["tryparsenext_base10_digit"]["Int64"] = @benchmarkable TextParse.tryparsenext_base10_digit(Int64, $intstr, 3, $intstrlen)

# Benchmarks for `TextParse.tryparsenext_base10`, called with the same inputs as above.
SUITE["util"]["tryparsenext_base10"] = BenchmarkGroup()
SUITE["util"]["tryparsenext_base10"]["Float64"] = @benchmarkable TextParse.tryparsenext_base10(Float64, $float64str, 3, $float64strlen)
SUITE["util"]["tryparsenext_base10"]["Int64"] = @benchmarkable TextParse.tryparsenext_base10(Int64, $intstr, 3, $intstrlen)

# Benchmarks for `TextParse.tryparsenext_sign`, split into inputs without a sign ("nosign")
# and inputs with a leading '-' ("neg"), each for a float string and an int string.
SUITE["util"]["tryparsenext_sign"] = BenchmarkGroup()
SUITE["util"]["tryparsenext_sign"]["nosign"] = BenchmarkGroup()
SUITE["util"]["tryparsenext_sign"]["nosign"]["Float64"] = @benchmarkable TextParse.tryparsenext_sign($float64str, 1, $float64strlen)
SUITE["util"]["tryparsenext_sign"]["nosign"]["Int64"] = @benchmarkable TextParse.tryparsenext_sign($intstr, 1, $intstrlen)
SUITE["util"]["tryparsenext_sign"]["neg"] = BenchmarkGroup()
SUITE["util"]["tryparsenext_sign"]["neg"]["Float64"] = @benchmarkable TextParse.tryparsenext_sign($negfloat64str, 1, $negfloat64strlen)
SUITE["util"]["tryparsenext_sign"]["neg"]["Int64"] = @benchmarkable TextParse.tryparsenext_sign($negintstr, 1, $negintstrlen)

# Inputs for the `eatwhitespaces` benchmarks: one string with a space at index 4
# and one string that contains no whitespace at all.
whitespacestring = "abc de"
nowhitespacestring = "abcde"

# Both benchmarks call `TextParse.eatwhitespaces` from index 4.
SUITE["util"]["eatwhitespaces"] = BenchmarkGroup()
SUITE["util"]["eatwhitespaces"]["withwhitespace"] = @benchmarkable TextParse.eatwhitespaces($whitespacestring, 4)
# The no-whitespace case must run on `nowhitespacestring`; it previously reused
# `whitespacestring`, so both entries measured the same input.
SUITE["util"]["eatwhitespaces"]["nowhitespacestring"] = @benchmarkable TextParse.eatwhitespaces($nowhitespacestring, 4)

# Input mixing "\r\n" and bare "\r" line endings; index 3 is the first '\r'.
newlinestring = "ab\r\n\r\r"

# Benchmark `TextParse.eatnewlines` called at index 3 of the string above.
SUITE["util"]["eatnewlines"] = BenchmarkGroup()
SUITE["util"]["eatnewlines"]["default"] = @benchmarkable TextParse.eatnewlines($newlinestring, 3)

# Benchmark `TextParse.getlineend` on the same string, without an explicit position argument.
SUITE["util"]["getlineend"] = BenchmarkGroup()
SUITE["util"]["getlineend"]["default"] = @benchmarkable TextParse.getlineend($newlinestring)

# Inputs for the token-level `tryparsenext` benchmarks, each paired with its last valid index.
percentagestring = "35.35%"
percentagestringlen = our_lastindex(percentagestring)
somestring = "foo something,"
somestringlen = our_lastindex(somestring)
# Quoted input containing a non-ASCII quote character (”) and doubled '"' characters.
somequotedstring = "\"Owner 2 ”Vicepresident\"\"\""
somequotedstringlen = our_lastindex(somequotedstring)

# Float input with many fractional digits, used by the "LongNumericFloat64" benchmark.
longfloat64str = "2344345.1232353459389238738435"
longfloat64strlen = our_lastindex(longfloat64str)

# Token and options used only by the DateTime benchmark.
tok = TextParse.DateTimeToken(DateTime, dateformat"yyyy-mm-dd HH:MM:SS")
# NOTE(review): the meaning of each positional field is defined by TextParse.LocalOpts,
# which is not visible here — confirm before changing any of these values.
opts = TextParse.LocalOpts('y', false, '"', '\\', false, false)
datetimestr = "1970-02-02 02:20:20"
datetimestrlen = our_lastindex(datetimestr)

# One benchmark per token type; tokens are constructed inside `$(...)` so their
# construction is interpolated into the benchmark expression.
SUITE["util"]["tryparsenext"] = BenchmarkGroup()
SUITE["util"]["tryparsenext"]["NumericFloat64"] = @benchmarkable TextParse.tryparsenext($(TextParse.Numeric(Float64)), $float64str,1,$float64strlen)
SUITE["util"]["tryparsenext"]["LongNumericFloat64"] = @benchmarkable TextParse.tryparsenext($(TextParse.Numeric(Float64)), $longfloat64str,1,$longfloat64strlen)
SUITE["util"]["tryparsenext"]["UInt64"] = @benchmarkable TextParse.tryparsenext($(TextParse.Numeric(UInt64)), $intstr,1,$intstrlen)
SUITE["util"]["tryparsenext"]["NegInt64"] = @benchmarkable TextParse.tryparsenext($(TextParse.Numeric(Int64)), $negintstr,1,$negintstrlen)
# NOTE(review): `TextParse.default_opts` is passed without `$` interpolation in the next
# two benchmarks, unlike every other argument — confirm this is intended.
SUITE["util"]["tryparsenext"]["Percentage"] = @benchmarkable TextParse.tryparsenext($(TextParse.Percentage()), $percentagestring,1,$percentagestringlen, TextParse.default_opts)
SUITE["util"]["tryparsenext"]["StringToken"] = @benchmarkable TextParse.tryparsenext($(TextParse.StringToken(String)), $somestring,1,$somestringlen, TextParse.default_opts)
SUITE["util"]["tryparsenext"]["DateTimeToken"] = @benchmarkable TextParse.tryparsenext($tok, $datetimestr,1,$datetimestrlen, $opts)
# NOTE(review): `Quoted` is used unqualified here (the rest of the file writes `TextParse.`),
# and this call passes no start/end index or options — verify both resolve as intended.
SUITE["util"]["tryparsenext"]["QuotedStringToken"] = @benchmarkable TextParse.tryparsenext($(Quoted(String,quotechar='"', escapechar='"')), $somequotedstring)
4 changes: 3 additions & 1 deletion src/TextParse.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
module TextParse

using CodecZlib, WeakRefStrings, Dates
using CodecZlib, WeakRefStrings, Dates, Nullables, DoubleFloats

include("lib/compat.jl")
include("util.jl")
include("field.jl")
include("record.jl")

include("utf8optimizations.jl")

include("guesstype.jl")
include("csv.jl")

Expand Down
13 changes: 9 additions & 4 deletions src/csv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,9 @@ function _csvread_internal(str::AbstractString, delim=',';
rowlength_sum = 0 # sum of lengths of rows, for estimating nrows
lineno = 0

if pos <= len
c, i = iterate(str, pos)
y = iterate(str, pos)
if y!==nothing
c = y[1]; i = y[2]
if c == '\ufeff'
pos = i
end
Expand All @@ -188,7 +189,9 @@ function _csvread_internal(str::AbstractString, delim=',';
lineno += lines
while lineno < skiplines_begin
pos = getlineend(str, pos)
_, pos = iterate(str, pos)
y2 = iterate(str, pos)
y2===nothing && error("Internal error.")
pos = y2[2]
pos, lines = eatnewlines(str, pos)
lineno += lines
end
Expand Down Expand Up @@ -640,7 +643,9 @@ function quotedsplit(str, opts, includequotes, i=firstindex(str), l=lastindex(st
@chk2 x, i = tryparsenext(f, str, i, l, opts)
push!(strs, x)
end
c, i = iterate(str, prevind(str, i))
y1 = iterate(str, prevind(str, i))
y1===nothing && error("Internal error.")
c = y1[1]; i = y1[2]
if c == opts.endchar
# edge case where there's a delim at the end of the string
push!(strs, "")
Expand Down
Loading