Skip to content

Commit 7afa32f

Browse files
author
Daniel Winkler
committed
Initial array support
1 parent 9cd1047 commit 7afa32f

File tree

7 files changed

+93
-1
lines changed

7 files changed

+93
-1
lines changed

src/JSONLines.jl

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import Base.Threads.@spawn
1010

1111
export readfile,
1212
readlazy,
13+
readarrays,
1314
reset!,
1415
writefile,
1516
@MStructType,
@@ -90,4 +91,20 @@ function writefile(file, data, mode = "w")
9091
close(fi)
9192
end
9293

94+
"""
    readarrays(file; namesline = 1, nrows = nothing, skip = nothing)

Read a JSONLines file whose rows are JSON arrays instead of JSON objects.

The row at `namesline` provides the column names; the rows that follow it are the data.
All arguments are forwarded unchanged to `getarrays`, whose result is returned as is.

* `file`: JSONLines file with JSON arrays (`[val1, val2, ...]`) as rows
* Keyword Arguments:
    * `namesline = 1`: Row that contains the names of the columns
    * `nrows = nothing`: Number of rows to load
    * `skip = nothing`: Number of rows to skip before loading
"""
readarrays(file; namesline = 1, nrows = nothing, skip = nothing) =
    getarrays(file, namesline, nrows, skip)
109+
93110
end # Module

src/file.jl

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,50 @@ function getfile(file, nlines, skip, usemmap)
1212
return ff
1313
end
1414

15+
# Read a JSONLines file whose rows are JSON arrays and build one NamedTuple per data row.
#
# * `file`: passed to `Mmap.mmap`
# * `namesline`: 1-based number of the row that holds the column names
# * `nlines`: maximum number of data rows to load (`nothing` means no limit)
# * `skip`: number of rows to skip after the names row (`nothing` means none)
#
# Returns a `Vector` of `NamedTuple`s keyed by the column names. When skipping lands on
# the last byte of the file, a single `NamedTuple` filled with `missing` is returned
# instead (unchanged from the previous implementation).
function getarrays(file, namesline, nlines, skip)
    nlines = something(nlines, _INT_MAX)
    skip = something(skip, 0)
    fi = Mmap.mmap(file)
    len = lastindex(fi)

    # Locate and parse the header row.
    namesbeg = namesline > 1 ? skiprows(fi, namesline - 1, 0) : 0
    namesr = detectarrayrow(fi, namesbeg)
    names = tuple(Symbol.(JSON3.read(fi[namesr[1]:namesr[2]]))...)

    # `prevend` is the byte index after which the next row is searched. It is tracked
    # separately from `rowindices` so that lines rejected by `isrow` (e.g. blank lines)
    # neither stall the scan nor cause an out-of-bounds lookup of the previous row.
    prevend = namesr[2]
    if skip > 0
        prevend = skiprows(fi, skip, prevend)
        if prevend == len
            return NamedTuple{names}(tuple(fill(missing, length(names))...))
        end
    end

    rowindices = Pair{Int, Int}[]
    # Count rows actually loaded, so `nlines <= 0` yields no rows at all.
    while length(rowindices) < nlines
        row = detectarrayrow(fi, prevend)
        if isrow(row)
            push!(rowindices, row)
        end
        # Stop at end of file; the second condition guarantees forward progress.
        if iseof(row, len) || row[2] <= prevend
            break
        end
        prevend = row[2]
    end
    return [NamedTuple{names}(tuple(JSON3.read(fi[r[1]:r[2]])...)) for r in rowindices]
end
58+
1559
# Read everything into ram
1660
function readstr(file)
1761
fi = read(file)

src/helpers.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
const _LSEP = UInt8('\n')
22
const _EOL = UInt8('}')
33
const _BOL = UInt8('{')
4+
const _ABOL = UInt8('[')
45
const _INT_MAX = typemax(Int)
56

67
# Detect space in UInt8
@@ -22,6 +23,19 @@ function detectrow(file::Vector{UInt8}, prevend::Int)
2223
return rowstart => rowend
2324
end
2425

26+
# Locate the next array row that begins after byte index `prevend`.
#
# Returns `rowstart => rowend`: `rowstart` is the index of the next `_ABOL` byte and
# `rowend` the index of the first `_LSEP` byte at or after `rowstart` (or the last index
# of `file` when the row is not newline-terminated). When no `_ABOL` byte remains, both
# ends are `lastindex(file)`.
function detectarrayrow(file::Vector{UInt8}, prevend::Int)
    searchstart = nextind(file, prevend)
    last = lastindex(file)
    rowstart = findnext(isequal(_ABOL), file, searchstart)
    if isnothing(rowstart)
        return last => last
    end
    # Look for the terminator from the row start rather than from `searchstart`;
    # otherwise a blank or non-array line ahead of the row gives rowend < rowstart.
    rowend = something(findnext(isequal(_LSEP), file, rowstart), last)
    return rowstart => rowend
end
38+
2539
function skiprows(file::Vector{UInt8}, n::Int, prevend::Int = 0)
2640
ind = nextind(file, prevend)
2741
for _ in 1:n

test/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
33
Pipe = "b98c9c47-44ae-5843-9183-064241ee97a0"
44
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
5+
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
56
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

test/runtests.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
using JSONLines
2-
using Test, DataFrames, RDatasets, Pipe
2+
using Test, DataFrames, RDatasets, Pipe, Tables
33

44
full_web = readfile("testfiles/jsonlwebsite.jsonl") |> DataFrame;
55
nrow_fw = nrow(full_web)
@@ -45,6 +45,12 @@ end
4545
@test [x for x in readlazy("testfiles/escapedeol.jsonl")] |> DataFrame == escaped
4646
end
4747

48+
@testset "Read arrays" begin
    # array.jsonl starts with a comment line, so the column names are on line 2.
    @test Tables.columntable(readarrays("testfiles/array.jsonl", namesline = 2)).a[2] == 4
    @test Tables.columntable(readarrays("testfiles/array.jsonl", skip = 1, namesline = 2)).a[1] == 4
    @test Tables.columntable(readarrays("testfiles/jsonlwebsitearray.jsonl")).Score == [24, 29, 14, 19]
    # `nrows` caps the number of data rows that are loaded.
    @test Tables.columntable(readarrays("testfiles/jsonlwebsitearray.jsonl", nrows = 2)).Score == [24, 29]
end
53+
4854
@testset "select" begin
4955
webl = @pipe readlazy("testfiles/jsonlwebsite.jsonl", returnparsed = false) |> JSONLines.select(_, :name) |> DataFrame
5056
@test webl == full_web[:, [:name]]
@@ -179,6 +185,7 @@ end
179185
@MStructType EscType name
180186
@test readfile("testfiles/escapedeol.jsonl", structtype = EscType) |> DataFrame == escaped[:, [:name]]
181187
end
188+
182189
# Cleanup
183190
rm("full_web.jsonl")
184191
rm("full_mtcars.jsonl")

test/testfiles/array.jsonl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
% My comment
2+
["a", "bcd", "efg"]
3+
[1, 2, 3]
4+
[4, 5, 6]
test/testfiles/jsonlwebsitearray.jsonl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
["Name", "Session", "Score", "Completed"]
2+
["Gilbert", "2013", 24, true]
3+
["Alexa", "2013", 29, true]
4+
["May", "2012B", 14, false]
5+
["Deloise", "2012A", 19, true]

0 commit comments

Comments
 (0)