tursodatabase / libsql

libSQL is a fork of SQLite that is both Open Source, and Open Contributions.

Home Page:https://turso.tech/libsql

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Inserts on child db fail in multi-tenant setup

Nipsuli opened this issue · comments

Original discussion in discord

In short, inserts into a database that was created with a linked schema from a parent db fail. Originally I was getting 500 responses consistently; now I get 400.

Code to reproduce

/* SETUP
turso group create my-default-group --location fra
turso group locations add my-default-group cdg
turso group update my-default-group --extensions all

# Set as TURSO_CONNECTION_URL
turso db create my-schema-db --group my-default-group --type schema

# Set as TURSO_MANAGEMENT_TOKEN
turso auth api-tokens mint local-dev

# Set as TURSO_AUTH_TOKEN
turso group tokens create my-default-group

# My org set as TURSO_ORG=
*/
import "dotenv/config";

import { createClient as createManagementClient } from "@tursodatabase/api";
import { createClient as createDbClient, LibsqlError } from "@libsql/client";

// Connection settings come from the environment ("dotenv/config" is imported above).
const {
  TURSO_CONNECTION_URL: schemaDbUrl,
  TURSO_AUTH_TOKEN: authToken,
  TURSO_MANAGEMENT_TOKEN: managementToken,
  TURSO_ORG: tursoOrg,
} = process.env;

// Abort early unless every setting is present and non-empty.
if (!(schemaDbUrl && authToken && managementToken && tursoOrg)) {
  throw new Error("Missing envvars");
}

// Group and schema database names, matching the SETUP commands in the header comment.
const mySchemaDb = "my-schema-db";
const myDefaultGroup = "my-default-group";

// DDL that main() executes against the schema database.
const SCHEMA = `
CREATE TABLE slack_messages (
	id integer PRIMARY KEY NOT NULL,
	ts text NOT NULL,
	sub_type text NOT NULL,
	hidden integer NOT NULL,
	user_id text NOT NULL,
	channel_id text NOT NULL,
	thread_id text NOT NULL,
	text_content text NOT NULL
);
`;

/**
 * Runs the reproduction end to end:
 *  1. executes SCHEMA against the schema database,
 *  2. creates a child database ("cdb-foobar") that references the schema
 *     database through the management API,
 *  3. batch-inserts one sample row into the child database in "write" mode.
 *
 * Any rejection from the clients propagates to the caller.
 */
const main = async () => {
  console.log("Imitating migration");
  const parentClient = createDbClient({ url: schemaDbUrl, authToken });
  await parentClient.execute(SCHEMA);

  console.log("Imitating new customer signing up");
  const api = createManagementClient({
    org: tursoOrg,
    token: managementToken,
  });
  const childDbName = "cdb-foobar";
  const childDb = await api.databases.create(childDbName, {
    group: myDefaultGroup,
    schema: mySchemaDb,
  });

  console.log("Imitating new slack messagew arriving in customer workspace");
  const childClient = createDbClient({
    url: `libsql://${childDb.hostname}`,
    authToken,
  });

  const sampleMessages = [
    {
      ts: "1713461663.539999",
      subType: "",
      hidden: true,
      userId: "U02QS0PKCQZ",
      channelId: "C06UA1583S6",
      threadId: "1713461663.539999",
      textContent: "aa",
    },
  ];

  // One parameterized INSERT per message; named placeholders are bound from `args`.
  const insertSql = `INSERT INTO slack_messages (ts, sub_type, hidden, user_id, channel_id, thread_id, text_content)
            VALUES (:ts, :subType, :hidden, :userId, :channelId, :threadId, :textContent)`;
  const statements = sampleMessages.map(
    ({ ts, subType, hidden, userId, channelId, threadId, textContent }) => ({
      sql: insertSql,
      args: {
        ts,
        subType,
        hidden: Number(hidden), // boolean to integer
        userId,
        channelId,
        threadId,
        textContent,
      },
    }),
  );

  await childClient.batch(statements, "write");
  console.log("All done");
};

// Entry point: run the script. On failure, log the error (plus code, rawCode
// and message when it is a LibsqlError) and exit with status 1.
const reportFailureAndExit = (error) => {
  console.error("Failed to process", error);
  if (error instanceof LibsqlError) {
    const { code, rawCode, message } = error;
    console.error("LibsqlError details:", code, rawCode, message);
  }
  process.exit(1);
};

main().catch(reportFailureAndExit);

And I get back:

❯ npx tsx ./failure.ts
Imitating migration
Imitating new customer signing up
Imitating new slack messagew arriving in customer workspace
Failed to process LibsqlError: SERVER_ERROR: Server returned HTTP status 400
    at mapHranaError (/Users/niko/code/reconfigured-v2/node_modules/@libsql/client/lib-cjs/hrana.js:287:16)
    at HttpClient.batch (/Users/niko/code/reconfigured-v2/node_modules/@libsql/client/lib-cjs/http.js:116:48)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at main (/Users/niko/code/reconfigured-v2/apps/db-writer-consumer-cf/failure.ts:78:3) {
  code: 'SERVER_ERROR',
  rawCode: undefined,
  [cause]: HttpServerError: Server returned HTTP status 400
      at errorFromResponse (/Users/niko/code/reconfigured-v2/node_modules/@libsql/hrana-client/lib-cjs/http/stream.js:367:12)
      at process.processTicksAndRejections (node:internal/process/task_queues:95:5) {
    status: 400
  }
}
LibsqlError details: SERVER_ERROR undefined SERVER_ERROR: Server returned HTTP status 400

After running

npm i dotenv
npm i @tursodatabase/api
npm i @libsql/client

The reproducer fails because it tries to use XMLHttpRequest:

╰──> npx tsx ./failure.ts          
(node:86322) ExperimentalWarning: --experimental-loader is an experimental feature. This feature could change at any time
(Use `node --trace-warnings ...` to show where the warning was created)
Imitating migration
Failed to process ReferenceError: XMLHttpRequest is not defined
    at /Users/haaawk/node_modules/whatwg-fetch/dist/fetch.umd.js:540:17
    at new Promise (<anonymous>)
    at fetch (/Users/haaawk/node_modules/whatwg-fetch/dist/fetch.umd.js:533:12)
    at fetchWithAgentSelection (/Users/haaawk/node_modules/@libsql/isomorphic-fetch/node.cjs:21:12)
    at HttpStream.#flush (/Users/haaawk/node_modules/@libsql/hrana-client/lib-cjs/http/stream.js:248:23)
    at HttpStream.#flushPipeline (/Users/haaawk/node_modules/@libsql/hrana-client/lib-cjs/http/stream.js:236:20)
    at HttpStream.#flushQueue (/Users/haaawk/node_modules/@libsql/hrana-client/lib-cjs/http/stream.js:226:32)
    at /Users/haaawk/node_modules/@libsql/hrana-client/lib-cjs/http/stream.js:184:76
    at node:internal/process/task_queues:141:7
    at AsyncResource.runInAsyncScope (node:async_hooks:199:9)

ok I needed node v18. but then the code just worked fine:

╰──> npx tsx ./failure.ts
Imitating migration
(node:91518) ExperimentalWarning: The Fetch API is an experimental feature. This feature could change at any time
(Use `node --trace-warnings ...` to show where the warning was created)
Imitating new customer signing up
Imitating new slack messagew arriving in customer workspace
All done

I'm wondering, then, what's f'd in my setup, as I haven't managed to get that thing working in my reconfigured org. It would be helpful to get something other than SERVER_ERROR as a response, with some hint of what's going on.

Now I'm having a hard time replicating this either. I'll get back when I find something more.

One known issue is that shared schema sometimes does not work with replicas. Since your setup has replicas that might be it. This might be the source of non-determinism since routing might get you either to primary or to replica. I will be looking into replica issue tomorrow so maybe I will find what's wrong.

I was wondering whether replicas and/or extensions could be what causes this. I tried different variations of the setup in https://github.com/Nipsuli/turso-testing but now each one of those succeeded 🤯

Outputs from the different test scripts

With simple parent schema it works.

❯ TURSO_ORG=<redacted> TURSO_MANAGEMENT_TOKEN=<redacted> ./run_simple.sh
v20.9.0
Created group my-simple-debug at fra in 6.261s.
Created database my-simple-debug at group my-simple-debug in 20.35s.

Start an interactive SQL shell with:

   turso db shell my-simple-debug

To see information about the database, including a connection URL, run:

   turso db show my-simple-debug

To get an authentication token for the database, run:

   turso db tokens create my-simple-debug

Imitating migration
Waiting to ensure things have been propagated...
Imitating new customer signing up
Waiting to ensure things have been propagated...
Imitating new slack message arriving in customer workspace
All done

With replica it works

❯ TURSO_ORG=<redacted> TURSO_MANAGEMENT_TOKEN=<redacted> ./run_replica.sh
v20.9.0
Created group my-replica-debug at fra in 7.971s.
Group my-replica-debug replicated to cdg in 4 seconds.
Created database my-replica-debug at group my-replica-debug in 7.885s.

Start an interactive SQL shell with:

   turso db shell my-replica-debug

To see information about the database, including a connection URL, run:

   turso db show my-replica-debug

To get an authentication token for the database, run:

   turso db tokens create my-replica-debug

Imitating migration
Waiting to ensure things have been propagated...
Imitating new customer signing up
Waiting to ensure things have been propagated...
Imitating new slack message arriving in customer workspace
All done

With extensions it works

❯ TURSO_ORG=<redacted> TURSO_MANAGEMENT_TOKEN=<redacted> ./run_extensions.sh
v20.9.0
Created group my-extensions-debug at fra in 6.453s.
To update group my-extensions-debug, all its locations and databases must be updated.
All your active connections to that group will be dropped and there will be a short downtime.

Are you sure you want to do this? [y/n]: y
✔  Success! Group my-extensions-debug was updated successfully
Created database my-extensions-debug at group my-extensions-debug in 3.744s.

Start an interactive SQL shell with:

   turso db shell my-extensions-debug

To see information about the database, including a connection URL, run:

   turso db show my-extensions-debug

To get an authentication token for the database, run:

   turso db tokens create my-extensions-debug

Imitating migration
Waiting to ensure things have been propagated...
Imitating new customer signing up
Waiting to ensure things have been propagated...
Imitating new slack message arriving in customer workspace
All done

With extensions and replica it works...

❯ TURSO_ORG=<redacted> TURSO_MANAGEMENT_TOKEN=<redacted> ./run_extensions_replica.sh
v20.9.0
Created group my-extensions-rep-debug at fra in 7.463s.
Group my-extensions-rep-debug replicated to cdg in 3 seconds.
To update group my-extensions-rep-debug, all its locations and databases must be updated.
All your active connections to that group will be dropped and there will be a short downtime.

Are you sure you want to do this? [y/n]: y
✔  Success! Group my-extensions-rep-debug was updated successfully
Created database my-extensions-rep-debug at group my-extensions-rep-debug in 827ms.

Start an interactive SQL shell with:

   turso db shell my-extensions-rep-debug

To see information about the database, including a connection URL, run:

   turso db show my-extensions-rep-debug

To get an authentication token for the database, run:

   turso db tokens create my-extensions-rep-debug

Imitating migration
Waiting to ensure things have been propagated...
Imitating new customer signing up
Waiting to ensure things have been propagated...
Imitating new slack message arriving in customer workspace
All done

Managed to once get a

Failed to process TursoClientError: cannot create/update/delete database config while there are pending migration on the shared schema `39d948df-94ba-4388-85a1-0359b59a67d5`
    at TursoClient.request (file:///Users/niko/code/turso-testing/node_modules/@tursodatabase/api/dist/index.js:400:13)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async DatabaseClient.create (file:///Users/niko/code/turso-testing/node_modules/@tursodatabase/api/dist/index.js:249:22)
    at async main (file:///Users/niko/code/turso-testing/index.mjs:46:17) {
  status: 400
}

and added wait times between steps to get rid of that

It is possible that we fixed the bug you've seen before already.

Regarding waiting. We have a special HTTP API to wait for a migration task to finish.
You can see a golang example of calling that API here:

package integrationtests

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"sync"
	"testing"
	"time"

	qt "github.com/frankban/quicktest"
)

// migrations is the JSON shape of a single migration job as decoded from the
// "/v1/jobs" responses handled in this file.
type migrations struct {
	// Id is the job identifier, carried as "job_id" in the JSON payload.
	Id     int    `json:"job_id"`
	// Status is the job state; this file compares it against "RunSuccess"
	// and "RunFailure".
	Status string `json:"status"`
}

// fetchLastMigrationJobId GETs the job listing at url (sending authToken as a
// Bearer token when it is non-empty) and picks the migration with the highest
// job id. It returns that id and whether its status is "RunSuccess".
//
// The test is failed via t.Fatal on any request/read/decode error, a non-200
// response, an empty migration list, or when the selected job's status is
// "RunFailure". The request is bounded by a 5 second timeout.
func fetchLastMigrationJobId(t *testing.T, url string, authToken string) (int, bool) {
	reqCtx, stop := context.WithTimeout(context.Background(), 5*time.Second)
	defer stop()

	request, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, nil)
	if err != nil {
		t.Fatal(err)
	}
	if authToken != "" {
		request.Header.Set("Authorization", "Bearer "+authToken)
	}

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		t.Fatal(err)
	}
	defer response.Body.Close()

	// The body is read in full before the status code is inspected.
	payload, err := io.ReadAll(response.Body)
	if err != nil {
		t.Fatal(err)
	}
	if response.StatusCode != http.StatusOK {
		t.Fatal("unexpected status code: ", response.StatusCode)
	}

	var listing struct {
		SchemaVersion int          `json:"schema_version"`
		Migrations    []migrations `json:"migrations"`
	}
	if err := json.Unmarshal(payload, &listing); err != nil {
		t.Fatal(err)
	}
	if len(listing.Migrations) == 0 {
		t.Fatal("No migration jobs found")
	}

	// Start below any expected id so the first job with a larger id is picked up.
	latest := migrations{Id: -1}
	for _, job := range listing.Migrations {
		if job.Id > latest.Id {
			latest = job
		}
	}
	if latest.Status == "RunFailure" {
		t.Fatal("Migration job failed")
	}
	return latest.Id, latest.Status == "RunSuccess"
}

// isMigrationJobFinished GETs "<url>/<jobId>" (sending authToken as a Bearer
// token when it is non-empty) and reports whether the returned job's status
// is "RunSuccess".
//
// The test is failed via t.Fatal on any request/read/decode error, a non-200
// response, or a "RunFailure" status. The request is bounded by a 5 second
// timeout.
func isMigrationJobFinished(t *testing.T, url string, authToken string, jobId int) bool {
	reqCtx, stop := context.WithTimeout(context.Background(), 5*time.Second)
	defer stop()

	jobURL := fmt.Sprintf("%s/%d", url, jobId)
	request, err := http.NewRequestWithContext(reqCtx, http.MethodGet, jobURL, nil)
	if err != nil {
		t.Fatal(err)
	}
	if authToken != "" {
		request.Header.Set("Authorization", "Bearer "+authToken)
	}

	response, err := http.DefaultClient.Do(request)
	if err != nil {
		t.Fatal(err)
	}
	defer response.Body.Close()

	// The body is read in full before the status code is inspected.
	payload, err := io.ReadAll(response.Body)
	if err != nil {
		t.Fatal(err)
	}
	if response.StatusCode != http.StatusOK {
		t.Fatal("unexpected status code: ", response.StatusCode)
	}

	var current migrations
	if err := json.Unmarshal(payload, &current); err != nil {
		t.Fatal(err)
	}
	if current.Status == "RunFailure" {
		t.Fatal("Migration job failed")
	}
	return current.Status == "RunSuccess"
}

// waitForSchemaMigrationToFinish blocks until the most recent migration job
// listed under db's "/v1/jobs" endpoint reports success. It returns at once
// if the job is already finished; otherwise it polls once per second and
// fails the test after more than 30 unsuccessful polls.
func waitForSchemaMigrationToFinish(t *testing.T, db *Db) {
	const maxPolls = 30

	jobsURL := db.HttpURL() + "/v1/jobs"
	token := db.AuthToken()

	lastJob, done := fetchLastMigrationJobId(t, jobsURL, token)
	if done {
		return
	}

	for attempt := 1; !isMigrationJobFinished(t, jobsURL, token, lastJob); attempt++ {
		if attempt > maxPolls {
			t.Fatal("Schema migration did not finish in time")
		}
		time.Sleep(time.Second)
	}
}

// TestSharedSchema runs the shared-schema scenario with a single child database.
func TestSharedSchema(t *testing.T, canary bool) {
	const childCount = 1
	testSharedSchema(t, canary, childCount)
}

// TestSharedSchemaMultipleDbs runs the shared-schema scenario with several
// child databases: 80 when canary is set, 5 otherwise.
func TestSharedSchemaMultipleDbs(t *testing.T, canary bool) {
	if canary {
		testSharedSchema(t, canary, 80)
		return
	}
	testSharedSchema(t, canary, 5)
}

// testSharedSchema exercises a schema (parent) database together with dbCount
// child databases that reference it:
//
//   - creates a group, the parent database and the children,
//   - creates a table on the parent and waits for the migration to finish,
//   - reads from and inserts into that table on every child,
//   - expects schema changes issued directly on a child to fail,
//   - inserts a row on the parent, waits, and checks every child sees it,
//   - checks that a child created afterwards also has the table and the row.
//
// dbCount must be at least 1 and, when 10 or more, a multiple of 10, because
// creation is then split evenly across 10 goroutines. The group is destroyed
// through t.Cleanup.
func testSharedSchema(t *testing.T, canary bool, dbCount int) {
	if dbCount < 1 {
		t.Fatal("dbCount should be at least 1")
	}
	if dbCount > 9 && dbCount%10 != 0 {
		t.Fatal("dbCount should be divisible by 10")
	}
	c := qt.New(t)
	groupName := NewName()
	primaryRegion := "iad"
	group := NewGroup(c, groupName, &primaryRegion, canary)
	group.Create("")
	t.Cleanup(func() {
		group.Destroy(false)
	})

	parentDb := NewSchemaDb(c, group, nil, NewName(), canary, true, "")
	parentDb.Create()

	var children []*Db
	if dbCount < 10 {
		for i := 0; i < dbCount; i++ {
			childDb := NewSchemaDb(c, group, nil, NewName(), canary, false, parentDb.Name)
			childDb.Create()
			children = append(children, childDb)
		}
	} else {
		// Ten workers each create dbCount/10 children and hand them back over
		// a channel buffered for all of them.
		ch := make(chan *Db, dbCount)
		for i := 0; i < 10; i++ {
			go func() {
				for j := 0; j < dbCount/10; j++ {
					childDb := NewSchemaDb(c, group, nil, NewName(), canary, false, parentDb.Name)
					childDb.Create()
					ch <- childDb
					// sleep to avoid hitting the rate limit
					time.Sleep(2 * time.Second)
				}
			}()
		}
		for i := 0; i < dbCount; i++ {
			childDb := <-ch
			children = append(children, childDb)
		}
	}

	// test schema propagation
	parentDb.Exec("CREATE TABLE test (id INT)")

	// Wait for the schema to migrate
	waitForSchemaMigrationToFinish(t, parentDb)

	// Children are partitioned across 10 goroutines by index modulo 10.
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for idx, childDb := range children {
				if idx%10 == i {
					childDb.Exec("SELECT * FROM test")
					childDb.Exec("insert into test (id) values (1)")
				}
			}
		}(i)
	}
	wg.Wait()

	// TODO
	// right now we don't get the proper error
	// Error: driver: bad connection: failed to read JSON message: failed to get reader: failed to read frame header: EOF
	// updating schema on child db should fail
	children[0].ExecExpectErr("CREATE TABLE test2 (id INT)", true)
	children[0].ExecExpectErr("DROP TABLE test", true)

	// test data propagation
	parentDb.Exec("insert into test (id) values (42)")

	// Wait for the data to migrate
	waitForSchemaMigrationToFinish(t, parentDb)

	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for idx, childDb := range children {
				if idx%10 == i {
					// Check, not Assert: Assert ends in t.Fatal/FailNow, which
					// must only be called from the goroutine running the test.
					// Check records the failure and lets this worker continue.
					// NOTE(review): Db.Exec may itself assert through c —
					// confirm it is safe to call from spawned goroutines.
					c.Check(childDb.Exec("SELECT * FROM test"), qt.Contains, "42")
				}
			}
		}(i)
	}
	wg.Wait()

	// creating a new db should have the schema and the data
	newDb := NewSchemaDb(c, group, nil, NewName(), canary, false, parentDb.Name)
	newDb.Create()
	c.Assert(newDb.Exec("SELECT * FROM test"), qt.Contains, "42")
}

It is possible that we fixed the bug you've seen before already.

That's possible.

One known issue is that shared schema sometimes does not work with replicas.

Now I'm running without replicas, as I'm not really needing them in the current setup.

As I'm not able to replicate this, and I have things working in my setup now with a fresh db group, I think this can be closed.

Thank you!

Thanks @Nipsuli ! Please don't hesitate to open another issue if you spot anything wrong with the schema migration feature.