Parquet datasource crashes on amd64 Ubuntu system without support for AVX2 instructions.
marklit opened this issue · comments
On my Ubuntu 20 system with 16 GB of RAM and 2 TB of disk capacity, I installed Go:
$ sudo apt update
$ sudo apt install software-properties-common
$ sudo add-apt-repository ppa:longsleep/golang-backports
$ sudo apt update
$ sudo apt install golang-go
I then cloned the master branch of OctoSQL from today.
$ git clone https://github.com/cube2222/octosql
$ cd octosql
I then dumped a 1.1B-row, 116 GB Snappy-compressed Parquet file from the latest version of ClickHouse.
$ clickhouse client \
-q "SELECT *
FROM trips
FORMAT Parquet" \
> trips.parquet
When I ran the SQL below I got the following error.
$ OCTOSQL_NO_TELEMETRY=1 go run main.go "SELECT * FROM trips.parquet LIMIT 10"
SIGILL: illegal instruction
PC=0x8f0db5 m=7 sigcode=2
instruction bytes: 0xc4 0xe2 0x7d 0x78 0x44 0x24 0x20 0xc5 0xfe 0x6f 0x8 0xc5 0xfe 0x6f 0x50 0x20
goroutine 1 [running]:
github.com/segmentio/parquet-go/internal/bits.countByte({0xc002a4a000, 0xffe1, 0xffe1}, 0x1)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/internal/bits/count_amd64.s:68 +0xf5 fp=0xc00020e0b0 sp=0xc00020e0a8 pc=0x8f0db5
github.com/segmentio/parquet-go/internal/bits.CountByte(...)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/internal/bits/count.go:3
github.com/segmentio/parquet-go.countLevelsEqual({0xc002a4a000?, 0x14315c0?, 0x54c286?}, 0x0?)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/page.go:244 +0x4f fp=0xc00020e100 sp=0xc00020e0b0 pc=0x9f2f2f
github.com/segmentio/parquet-go.countLevelsNotEqual(...)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/page.go:248
github.com/segmentio/parquet-go.(*Column).decodeDataPageV1(0xc000126dd0, {0xc000274840?}, 0xc0000c62a0)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column.go:610 +0x414 fp=0xc00020e198 sp=0xc00020e100 pc=0x9ddf74
github.com/segmentio/parquet-go.(*filePages).ReadPage(0xc0000c6230)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/file.go:517 +0x212 fp=0xc00020e2e0 sp=0xc00020e198 pc=0x9efa32
github.com/segmentio/parquet-go.(*multiPages).ReadPage(0xc000032880)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/multi_row_group.go:223 +0x3f fp=0xc00020e328 sp=0xc00020e2e0 pc=0x9f133f
github.com/segmentio/parquet-go.(*columnChunkReader).readPage(0xc00016cb20)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:102 +0xa2 fp=0xc00020e350 sp=0xc00020e328 pc=0x9e1542
github.com/segmentio/parquet-go.(*columnChunkReader).readValues(0xdbfba0?)
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:119 +0x71 fp=0xc00020e390 sp=0xc00020e350 pc=0x9e1651
github.com/segmentio/parquet-go.columnReadRowFuncOfLeaf.func1({0xc0002747e0?, 0x3, 0x4}, 0xff?, {0xc00016ca00, 0x7fd1b1757108?, 0x40?})
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:326 +0xc5 fp=0xc00020e428 sp=0xc00020e390 pc=0x9e2205
github.com/segmentio/parquet-go.makeColumnReadRowFunc.func1({0x0?, 0xf32600?, 0xc00016e300?}, 0xc8?, {0xc00016ca00, 0x35, 0x35})
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/schema.go:163 +0xa3 fp=0xc00020e490 sp=0xc00020e428 pc=0x9fb943
github.com/segmentio/parquet-go.(*rowGroupRowReader).ReadRow(0xc00016e340?, {0x0?, 0x0, 0x0?})
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/row_group.go:306 +0xb7 fp=0xc00020e4d8 sp=0xc00020e490 pc=0x9fa4b7
github.com/segmentio/parquet-go.(*reader).ReadRow(0xc0000c00a0, {0x0?, 0x0, 0x0?})
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/reader.go:276 +0xb1 fp=0xc00020e508 sp=0xc00020e4d8 pc=0x9f9331
github.com/segmentio/parquet-go.(*Reader).ReadRow(0xc0000c0090, {0x0, 0x0, 0x0})
/home/mark/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/reader.go:221 +0x65 fp=0xc00020e540 sp=0xc00020e508 pc=0x9f9205
github.com/cube2222/octosql/datasources/parquet.(*DatasourceExecuting).Run(0xc00021c9c0, {{0xf30588?, 0xc00016edc0?}, 0x0?}, 0xc00021cb10, 0x0?)
/home/mark/octosql/datasources/parquet/execution.go:47 +0x512 fp=0xc00020e6e0 sp=0xc00020e540 pc=0xa3bfd2
github.com/cube2222/octosql/execution/nodes.(*Limit).Run(0xc000033780, {{0xf30588?, 0xc00016edc0?}, 0x0?}, 0xc00021c9f0, 0xc0000a6700?)
/home/mark/octosql/execution/nodes/limit.go:34 +0x3a6 fp=0xc00020e910 sp=0xc00020e6e0 pc=0x5a6546
github.com/cube2222/octosql/outputs/batch.(*OutputPrinter).Run(0xc0000a6700, {{0xf30588?, 0xc00016edc0?}, 0x0?})
/home/mark/octosql/outputs/batch/live_output.go:81 +0x396 fp=0xc00020ea20 sp=0xc00020e910 pc=0xa5f476
github.com/cube2222/octosql/cmd.glob..func4(0x1537ce0, {0xc00022c6d0, 0x1, 0x1?})
/home/mark/octosql/cmd/root.go:463 +0x3653 fp=0xc00020fd70 sp=0xc00020ea20 pc=0xc16633
github.com/spf13/cobra.(*Command).execute(0x1537ce0, {0xc000032050, 0x1, 0x1})
/home/mark/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:856 +0x67c fp=0xc00020fe48 sp=0xc00020fd70 pc=0x638e5c
github.com/spf13/cobra.(*Command).ExecuteC(0x1537ce0)
/home/mark/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:974 +0x3b4 fp=0xc00020ff00 sp=0xc00020fe48 pc=0x6394d4
github.com/spf13/cobra.(*Command).Execute(...)
/home/mark/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:902
github.com/spf13/cobra.(*Command).ExecuteContext(...)
/home/mark/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:895
github.com/cube2222/octosql/cmd.Execute({0xf30588?, 0xc00016edc0?})
/home/mark/octosql/cmd/root.go:476 +0x53 fp=0xc00020ff20 sp=0xc00020ff00 pc=0xc18233
main.main()
/home/mark/octosql/main.go:24 +0xe8 fp=0xc00020ff80 sp=0xc00020ff20 pc=0xc18f48
runtime.main()
/usr/lib/go-1.18/src/runtime/proc.go:250 +0x212 fp=0xc00020ffe0 sp=0xc00020ff80 pc=0x43a172
runtime.goexit()
/usr/lib/go-1.18/src/runtime/asm_amd64.s:1571 +0x1 fp=0xc00020ffe8 sp=0xc00020ffe0 pc=0x46a981
goroutine 7 [syscall]:
os/signal.signal_recv()
/usr/lib/go-1.18/src/runtime/sigqueue.go:151 +0x2f
os/signal.loop()
/usr/lib/go-1.18/src/os/signal/signal_unix.go:23 +0x19
created by os/signal.Notify.func1.1
/usr/lib/go-1.18/src/os/signal/signal.go:151 +0x2a
goroutine 8 [chan receive]:
main.main.func1()
/home/mark/octosql/main.go:17 +0x31
created by main.main
/home/mark/octosql/main.go:16 +0xd9
goroutine 9 [select]:
github.com/dgraph-io/ristretto.(*defaultPolicy).processItems(0xc00021d380)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:96 +0x91
created by github.com/dgraph-io/ristretto.newDefaultPolicy
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:80 +0x156
goroutine 10 [select]:
github.com/dgraph-io/ristretto.(*Cache).processItems(0xc0000775c0)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:314 +0xa8
created by github.com/dgraph-io/ristretto.NewCache
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:162 +0x56a
goroutine 11 [select]:
github.com/dgraph-io/ristretto.(*defaultPolicy).processItems(0xc0002594a0)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:96 +0x91
created by github.com/dgraph-io/ristretto.newDefaultPolicy
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:80 +0x156
goroutine 12 [select]:
github.com/dgraph-io/ristretto.(*Cache).processItems(0xc000077740)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:314 +0xa8
created by github.com/dgraph-io/ristretto.NewCache
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:162 +0x56a
goroutine 13 [select]:
github.com/dgraph-io/ristretto.(*defaultPolicy).processItems(0xc0002635c0)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:96 +0x91
created by github.com/dgraph-io/ristretto.newDefaultPolicy
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/policy.go:80 +0x156
goroutine 14 [select]:
github.com/dgraph-io/ristretto.(*Cache).processItems(0xc0000778c0)
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:314 +0xa8
created by github.com/dgraph-io/ristretto.NewCache
/home/mark/go/pkg/mod/github.com/dgraph-io/ristretto@v0.0.3/cache.go:162 +0x56a
rax 0xc002a4a000
rbx 0xc00295c001
rcx 0xc002a59fe1
rdx 0xc002a59fc0
rdi 0x1
rsi 0x0
rbp 0xc00020e0f0
rsp 0xc00020e0a8
r8 0x83ff8
r9 0x0
r10 0x0
r11 0x1
r12 0x0
r13 0x0
r14 0x0
r15 0x0
rip 0x8f0db5
rflags 0x10206
cs 0x33
fs 0x0
gs 0x0
exit status 2
Hey @marklit!
Thanks for the report, but this doesn't look like a file size issue.
This looks like a bug in the parquet datasource which is experimental right now. Based on my reading of the library underneath, it uses hand-written assembly for some elements of the implementation in order to use AVX2 instructions and it looks like those crash on your machine.
Please try building OctoSQL in the following way:
CGO_ENABLED=0 go build --tags purego
OCTOSQL_NO_TELEMETRY=1 ./octosql "SELECT * FROM trips.parquet LIMIT 10"
This should use a pure Go implementation of the aforementioned instructions and shouldn't crash.
Please let me know if that works.
@marklit
You should also be able to use the official release I've just created as I've added this build tag there as well.
You're right. The VM I had set up had AVX2 disabled for some reason. I got 10 records to return with your official build in 4 seconds.
I re-configured my VM to support AVX2 but still got the above error when running go run main.go.
Running your official build again on Q1 from my benchmark suite returned an "index out of range" error.
$ ./octosql "SELECT cab_type, count(*) FROM trips.parquet GROUP BY cab_type"
panic: runtime error: index out of range [825241904] with length 19994
goroutine 1 [running]:
github.com/segmentio/parquet-go.(*byteArrayDictionary).Index(0x15be5c0?, 0x276110?)
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/dictionary.go:86 +0xed
github.com/segmentio/parquet-go.(*indexedPageReader).ReadValues(0xc018185950, {0xc016f37e80, 0xaa, 0xc000075f20?})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/dictionary.go:338 +0x89
github.com/segmentio/parquet-go.(*optionalPageReader).ReadValues(0xc0181b33c0, {0xc016f37e80, 0xaa, 0xaa})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/page.go:382 +0x14a
github.com/segmentio/parquet-go.(*columnChunkReader).readValuesFromCurrentPage(0xc0001e1300)
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:135 +0x90
github.com/segmentio/parquet-go.(*columnChunkReader).readValues(0x32?)
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:115 +0x29
github.com/segmentio/parquet-go.columnReadRowFuncOfLeaf.func1({0x0?, 0x0, 0x0}, 0x0?, {0xc0001e0a00, 0x0?, 0x0?})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/column_chunk.go:326 +0xc5
github.com/segmentio/parquet-go.makeColumnReadRowFunc.func1({0x0?, 0x3?, 0x0?}, 0x0?, {0xc0001e0a00, 0x35, 0x35})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/schema.go:163 +0xa3
github.com/segmentio/parquet-go.(*rowGroupRowReader).ReadRow(0x0?, {0x0?, 0x0, 0x0?})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/row_group.go:306 +0xb7
github.com/segmentio/parquet-go.(*reader).ReadRow(0xc00012e0a0, {0x0?, 0x0, 0x0?})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/reader.go:276 +0xb1
github.com/segmentio/parquet-go.(*Reader).ReadRow(0xc00012e090, {0x0, 0x0, 0x0})
/home/runner/go/pkg/mod/github.com/cube2222/parquet-go@v0.0.0-20220512155810-0e06eee50261/reader.go:221 +0x65
github.com/cube2222/octosql/datasources/parquet.(*DatasourceExecuting).Run(0xc0000f2fc0, {{0xf5f168?, 0xc0001e2dc0?}, 0x0?}, 0xc0000f30e0, 0x1?)
/home/runner/work/octosql/octosql/datasources/parquet/execution.go:47 +0x512
github.com/cube2222/octosql/execution/nodes.(*SimpleGroupBy).Run(0xc000075f20, {{0xf5f168?, 0xc0001e2dc0?}, 0x0?}, 0xc0000f30b0, 0xc0000f3050)
/home/runner/work/octosql/octosql/execution/nodes/simple_group_by.go:38 +0x228
github.com/cube2222/octosql/execution/nodes.(*Map).Run(0xc0000f2ff0, {{0xf5f168?, 0xc0001e2dc0?}, 0x0?}, 0xc0000f3080, 0x0?)
/home/runner/work/octosql/octosql/execution/nodes/map.go:23 +0xfc
github.com/cube2222/octosql/execution/nodes.(*Limit).Run(0xc000031720, {{0xf5f168?, 0xc0001e2dc0?}, 0x0?}, 0xc0000f3020, 0xc00011c700?)
/home/runner/work/octosql/octosql/execution/nodes/limit.go:34 +0x3a6
github.com/cube2222/octosql/outputs/batch.(*OutputPrinter).Run(0xc00011c700, {{0xf5f168?, 0xc0001e2dc0?}, 0x0?})
/home/runner/work/octosql/octosql/outputs/batch/live_output.go:81 +0x396
github.com/cube2222/octosql/cmd.glob..func4(0x156ba00, {0xc00009e0b0, 0x1, 0x1?})
/home/runner/work/octosql/octosql/cmd/root.go:463 +0x3653
github.com/spf13/cobra.(*Command).execute(0x156ba00, {0xc000030070, 0x1, 0x1})
/home/runner/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:856 +0x67c
github.com/spf13/cobra.(*Command).ExecuteC(0x156ba00)
/home/runner/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:974 +0x3b4
github.com/spf13/cobra.(*Command).Execute(...)
/home/runner/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:902
github.com/spf13/cobra.(*Command).ExecuteContext(...)
/home/runner/go/pkg/mod/github.com/spf13/cobra@v1.4.0/command.go:895
github.com/cube2222/octosql/cmd.Execute({0xf5f168?, 0xc0001e2dc0?})
/home/runner/work/octosql/octosql/cmd/root.go:476 +0x53
main.main()
/home/runner/work/octosql/octosql/main.go:24 +0xe8
@marklit Could you please describe exactly how you created the trips table in ClickHouse? That would help immensely with debugging this issue (and creating a sensible issue for the parquet library authors).
I imported the CSV files from my 1.1B taxi rides dataset into a Log engine table in ClickHouse.
$ clickhouse-client
CREATE TABLE trips (
trip_id UInt32,
vendor_id String,
pickup_datetime DateTime,
dropoff_datetime Nullable(DateTime),
store_and_fwd_flag Nullable(FixedString(1)),
rate_code_id Nullable(UInt8),
pickup_longitude Nullable(Float64),
pickup_latitude Nullable(Float64),
dropoff_longitude Nullable(Float64),
dropoff_latitude Nullable(Float64),
passenger_count Nullable(UInt8),
trip_distance Nullable(Float64),
fare_amount Nullable(Float32),
extra Nullable(Float32),
mta_tax Nullable(Float32),
tip_amount Nullable(Float32),
tolls_amount Nullable(Float32),
ehail_fee Nullable(Float32),
improvement_surcharge Nullable(Float32),
total_amount Nullable(Float32),
payment_type Nullable(String),
trip_type Nullable(UInt8),
pickup Nullable(String),
dropoff Nullable(String),
cab_type Nullable(String),
precipitation Nullable(Int8),
snow_depth Nullable(Int8),
snowfall Nullable(Int8),
max_temperature Nullable(Int8),
min_temperature Nullable(Int8),
average_wind_speed Nullable(Int8),
pickup_nyct2010_gid Nullable(Int8),
pickup_ctlabel Nullable(String),
pickup_borocode Nullable(Int8),
pickup_boroname Nullable(String),
pickup_ct2010 Nullable(String),
pickup_boroct2010 Nullable(String),
pickup_cdeligibil Nullable(FixedString(1)),
pickup_ntacode Nullable(String),
pickup_ntaname Nullable(String),
pickup_puma Nullable(String),
dropoff_nyct2010_gid Nullable(UInt8),
dropoff_ctlabel Nullable(String),
dropoff_borocode Nullable(UInt8),
dropoff_boroname Nullable(String),
dropoff_ct2010 Nullable(String),
dropoff_boroct2010 Nullable(String),
dropoff_cdeligibil Nullable(String),
dropoff_ntacode Nullable(String),
dropoff_ntaname Nullable(String),
dropoff_puma Nullable(String)
) ENGINE = Log;
$ for FILENAME in trips_x*.csv.gz; do
gunzip -c $FILENAME | clickhouse-client --query="INSERT INTO trips FORMAT CSV"
done
I've tried running parquet-tools on the .pq file but it exhausts my system's RAM. I'm not sure if trying to make 100 GB+ PQ files work with this system is a better path than trying to make multiple files in a single folder work. 10 GB+ PQ files have a surprising number of problems with most tooling that supports PQ.
Hey!
Thanks for the details! I'll investigate them once I have some time.