zig-csv

A CSV parser and serializer for zig with good performance and ergonomics: it parses CSV files into user-defined structs.

Disclaimer: I haven't used this in production and I don't know how it would fare there. I made this to learn zig.

Quickstart

Install

It is not super clear to me how people install libraries in zig, but these instructions have worked for me:

  1. Make a vendor folder in your project and add csv as a git submodule:
$ mkdir vendor
$ git submodule add https://github.com/bensu/csv-zig vendor/csv
$ ls vendor/csv
README.md	build.zig	data		src		test
  2. Add it as a package in your build.zig:
const exe = b.addExecutable("your-app", "src/main.zig");
exe.addPackage(.{
    .name = "csv",
    .source = .{ .path = "vendor/csv/src/csv.zig" },
});
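
With the package registered under the name "csv", application code should be able to import it by that name. (The examples below come from the library's own test suite, which imports src/csv.zig by relative path instead.)

const csv = @import("csv");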

Parse

Consider the following CSV file test/data/pokemon_example.csv:

id,name,captured,color,health,
1,squirtle,false,blue,,
2,charmander,false,red,,
3,pikachu,true,yellow,10.0,

You can define a struct with the fields you expect to find in it and then parse the file with an iterator:

const std = @import("std");
const fs = std.fs;

// Import csv
const csv = @import("csv.zig");

const Color = enum { red, blue, green, yellow };

// Define the type of CSV rows as a struct
const Pokemon = struct {
    id: u32,
    name: []const u8,
    captured: bool,
    color: Color,
    health: ?f32,
};

test "parsing pokemon" {
    var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
    defer file.close();
    const reader = file.reader();

    const allocator = std.testing.allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    const config: csv.CsvConfig = .{}; // use the default config

    const PokemonCsvParser = csv.CsvParser(Pokemon, fs.File.Reader, config);

    var parser = try PokemonCsvParser.init(arena.allocator(), reader);

    var number_captured: u32 = 0;
    while (try parser.next()) |pokemon| {
        if (pokemon.captured) {
            number_captured += 1;
        }
    }

    try std.testing.expectEqual(@as(u32, 1), number_captured);
    std.debug.print("You have captured {} Pokemons", .{number_captured});
}

Serialize

Now, instead of parsing the file, we are going to serialize the same contents from in-memory data into tmp/pokemon.csv:

test "serializing pokemon" {
    var file = try fs.cwd().createFile("tmp/pokemon.csv", .{});
    defer file.close();
    const writer = file.writer();

    const allocator = std.testing.allocator;
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();

    const config: csv.CsvConfig = .{};
    const PokemonCsvSerializer = csv.CsvSerializer(Pokemon, fs.File.Writer, config);
    var serializer = PokemonCsvSerializer.init(writer);

    const pokemons = [3]Pokemon{
        Pokemon{
            .id = 1,
            .name = "squirtle",
            .captured = false,
            .color = Color.blue,
            .health = null,
        },
        Pokemon{
            .id = 2,
            .name = "charmander",
            .captured = false,
            .color = Color.red,
            .health = null,
        },
        Pokemon{
            .id = 3,
            .name = "pikachu",
            .captured = true,
            .color = Color.yellow,
            .health = 10.0,
        },
    };

    try serializer.writeHeader();

    for (pokemons) |pokemon| {
        try serializer.appendRow(pokemon);
    }
}

tmp/pokemon.csv should now have the same contents as test/data/pokemon_example.csv above, header included.
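If you want to verify that, here is a minimal sketch that compares the two files, assuming the std and fs imports from the Parse example and that both files fit in memory:

test "generated csv matches the original" {
    const allocator = std.testing.allocator;
    // Read both files fully (up to 1 MiB) and compare them byte for byte
    const expected = try fs.cwd().readFileAlloc(allocator, "test/data/pokemon_example.csv", 1 << 20);
    defer allocator.free(expected);
    const actual = try fs.cwd().readFileAlloc(allocator, "tmp/pokemon.csv", 1 << 20);
    defer allocator.free(actual);
    try std.testing.expectEqualStrings(expected, actual);
}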

API Reference

pub const CsvConfig = struct {
    field_end_delimiter: u8 = ',',
    row_end_delimiter: u8 = '\n',
    quote_delimiter: u8 = '"',
    skip_first_row: bool = true,
};
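
Every field has a default, so you only need to override what differs. For example, a semicolon-separated file without a header row might be configured like this (a sketch; the variable name is illustrative):

const semicolon_config: csv.CsvConfig = .{
    .field_end_delimiter = ';',
    .skip_first_row = false,
};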

pub fn CsvParser(
    comptime T: type,
    comptime Reader: type,
    comptime config: CsvConfig,
) type {
    return struct {
        const Self = @This();

        // Create one CsvParser, valid for one pass over the Reader
        fn init(
            allocator: std.mem.Allocator,
            reader: Reader,
        ) CsvParseError!Self {}

        // Returns the next row T, or null when the iterator is done
        fn next() CsvParseError!?T {}

        // Like next(), but writes the struct into the provided pointer
        fn nextInto(struct_pointer: *T) CsvParseError!?*T {}
    };
}


pub fn CsvSerializer(
    comptime T: type,
    comptime Writer: type,
    comptime config: CsvConfig,
) type {
    return struct {
        const Self = @This();

        fn init(writer: Writer) Self {}

        fn writeHeader() WriterError!void {}

        fn appendRow(data: T) WriterError!void {}
    };
}

with the following errors:

// Errors generated by the CsvParser
const CsvParseSpecificError = error{
    BadInput,
    MissingFields,
    ExtraFields,
    OutOfMemory,
};

pub const ReaderError = error { ... }; // from reader.read(), e.g. fs.File.Reader
pub const WriterError = error { ... }; // from writer.write(), e.g. fs.File.Writer

pub const CsvParseError = CsvParseSpecificError || ReaderError;
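
Given a parser like the one in the example below, a caller can branch on the parser-specific errors; a sketch (the else arm covers OutOfMemory and whatever errors the underlying Reader can return):

const maybe_row = parser.next() catch |err| switch (err) {
    error.BadInput => {
        std.debug.print("malformed field or delimiter\n", .{});
        return err;
    },
    error.MissingFields, error.ExtraFields => {
        std.debug.print("row does not match the struct's fields\n", .{});
        return err;
    },
    else => return err,
};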

Example usage:

const config: csv.CsvConfig = .{
    .field_end_delimiter = ',',
    .row_end_delimiter = '\n',
    .quote_delimiter = '"',
    .skip_first_row = true,
};

const StructType = struct {
    int_field:   u32,
    float_field: f64,
    str_field:   []const u8,
    enum_field:  enum { red, blue, yellow },
    union_field: union { int_case: i32, float_case: f32 },
    bool_field:  bool,
    maybe_field: ?f64,
    void_field:  void,  // Use to skip parsing certain columns
};

var parser = try csv.CsvParser(StructType, fs.File.Reader, config).init(allocator, reader);

var total: u32 = 0;
while (try parser.next()) |row| {
    // do something with the row
    if (std.mem.eql(u8, "special_value", row.str_field)) {
        total += row.int_field;
    }
}

var serializer = csv.CsvSerializer(StructType, fs.File.Writer, config).init(writer);

try serializer.writeHeader();
try serializer.appendRow(StructType{ ... });
try serializer.appendRow(StructType{ ... });
// ...

Examples

Parse from one file and serialize into another one

From src/csv/end_to_end.zig:

const T = struct { id: i64, age: u32 };

const from_path = "data/from_file.csv";
var from_file = try fs.cwd().openFile(from_path, .{});
defer from_file.close();
const reader = from_file.reader();

const to_path = "tmp/to_file.csv";
var to_file = try fs.cwd().createFile(to_path, .{});
defer to_file.close();
const writer = to_file.writer();

const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();

var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena.allocator(), reader);

var serializer = csv.CsvSerializer(T, fs.File.Writer, .{}).init(writer);

var rows: usize = 0;
try serializer.writeHeader();
while (try parser.next()) |row| {
    rows += 1;
    try serializer.appendRow(row);
}

std.debug.print("Wrote {} rows", .{rows});

Parse into a pre-allocated Array

From src/csv/parse.zig:

const T = struct { id: i64, age: u32 };

const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();

const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();

// if you know how many rows to expect, you can use an array directly
const expected_rows = 17;
const array: []T = try arena_allocator.alloc(T, expected_rows);

var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);

var i: usize = 0;
while (i < expected_rows) {
    _ = try parser.nextInto(&array[i]);
    i += 1;
}
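
If the file might hold fewer rows than expected, a variation of the loop (a sketch) can stop early, since nextInto returns null once the input is exhausted:

var i: usize = 0;
while (i < expected_rows) : (i += 1) {
    // Stop as soon as the parser runs out of rows
    if (try parser.nextInto(&array[i]) == null) break;
}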

Parse into a pre-allocated ArrayList

From src/csv/parse.zig:

const T = struct { id: i64, age: u32 };

const file_path = "test/data/simple_end_to_end.csv";
var file = try fs.cwd().openFile(file_path, .{});
defer file.close();
const reader = file.reader();

const allocator = std.testing.allocator;
var arena = std.heap.ArenaAllocator.init(allocator);
defer arena.deinit();
const arena_allocator = arena.allocator();

// if you don't know how many rows to expect, you can use ArrayList
var list = std.ArrayList(T).init(allocator);
defer list.deinit();

var parser = try csv.CsvParser(T, fs.File.Reader, .{}).init(arena_allocator, reader);

while (try parser.next()) |row| {
    try list.append(row);
}
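
One caveat: the []const u8 fields of each stored row are allocated by the parser with the allocator you pass to init (the arena here), so the arena must outlive any use of the list's contents.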

Performance: skip fields and re-use strings

To improve performance, you can:

  1. Assign void to the fields that you don't need and the parser will skip them.
  2. Re-use the same memory for the strings of every row, provided you don't need to keep those strings after you have processed them.

// 1. We mark void every field we don't need, maintaining their order

const NamelessPokemon = struct {
    id: void,
    name: []const u8,
    captured: bool,
    color: void,
    health: void,
};

var file = try fs.cwd().openFile("test/data/pokemon_example.csv", .{});
defer file.close();
const reader = file.reader();

// 2. We will keep the strings of one row at a time in this buffer
var buffer: [4096]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buffer);

const PokemonCsvParser = csv.CsvParser(NamelessPokemon, fs.File.Reader, .{});

var parser = try PokemonCsvParser.init(fba.allocator(), reader);

var pikachus_captured: u32 = 0;
while (try parser.next()) |pokemon| {

    // 1. We only use pokemon.captured and pokemon.name, everything else is void
    if (pokemon.captured and std.mem.eql(u8, "pikachu", pokemon.name)) {
        pikachus_captured += 1;
    }

    // 2. We already used the allocated strings (pokemon.name) so we can reset
    //    the memory. If we didn't, we would get an OutOfMemory error when the
    //    FixedBufferAllocator runs out of memory
    fba.reset();
}

std.debug.print("You captured {} Pikachus", .{pikachus_captured});

Parse and serialize directly from buffers

From src/csv/end_to_end_test.zig:

test "buffer end to end" {
    const T = struct { id: u32, name: []const u8 };

    // parse
    const source = "id,name,\n1,none,";
    const n = source.len;

    var parsed_rows: [1]T = undefined;

    var buffer_stream = std.io.fixedBufferStream(source[0..n]);
    const reader = buffer_stream.reader();

    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena.deinit();
    const arena_allocator = arena.allocator();

    var parser = try csv.CsvParser(T, @TypeOf(reader), .{}).init(arena_allocator, reader);

    var i: usize = 0;
    while (try parser.next()) |row| {
        parsed_rows[i] = row;
        i += 1;
    }

    // serialize
    var buffer: [n + 1]u8 = undefined;
    var fixed_buffer_stream = std.io.fixedBufferStream(buffer[0..]);
    const writer = fixed_buffer_stream.writer();

    var serializer = csv.CsvSerializer(T, @TypeOf(writer), .{}).init(writer);

    try serializer.writeHeader();
    for (parsed_rows) |row| {
        try serializer.appendRow(row);
    }

    try std.testing.expect(std.mem.eql(u8, source, buffer[0..n]));
}

Informal benchmarks

On my M1, this library can run over a 144MB CSV file in 418ms if it parses every column, and in 301ms if it only extracts a few fields:

$ # get the benchmark data
$ git submodule update --remote benchmark
$ cd benchmark
$ bash unzip_data.bash
$ cd ..
$ # run the benchmark
$ zig build -Drelease-fast=true; zig-out/bin/csv

Starting benchmark
Parsed in 4ms on average     -- bench.NFL               // 1.3MB all columns, 325 MB/s
Parsed in 418ms on average   -- bench.FullPopulation    // 144MB all columns, 344 MB/s
Parsed in 301ms on average   -- bench.Population        // 144MB few columns, 478 MB/s
Parsed in 1ms on average     -- bench.MBTA              // N/A 1ms might be off by 50%
Parsed in 263ms on average   -- bench.Trade             // 150MB all columns, 570 MB/s
Parsed in 117ms on average   -- bench.StateDepartment   //  70MB all columns, 598 MB/s
Number of US-MA population: 5988064 in 420 ms           // 144MB all columns, 342 MB/s
Total population: 2289584999 in 291 ms                  // 144MB few columns, 494 MB/s

I took these benchmark files from other great projects. Thank you to those authors for compiling the benchmarks!

After running those benchmarks on my computer, this library is on par with, or slightly faster than, rust-csv and cpp/csv-parser, and around 2x faster than the Java libraries (which makes sense, since zig can avoid many of the allocations that Java requires). You can find more details in the benchmarks documentation.

License

MIT License