eugeneware / jsonquery

MongoDB query language implemented as a node.js Stream

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

level-search

dominictarr opened this issue · comments

hi,

I just noticed this module yesterday,
which was exciting, because I had just implemented a
search api for level.

basically, each pair of keys in every document is indexed:

{ name: 'foo',
  repository: { type: 'git', url: 'https://github.com/foofoo/foo.git'},
  dependencies: {
    bar: '1.0.0'
  }
}

would index that file by the pairs:

name foo
repository type
repository url
type git
url https://github.com/foofoo/foo.git
dependencies bar
bar 1.0.0

this makes it possible to do nearly any search,
you could probably get really clever, but as a first step,
I just search on the pair that seems the least likely,
and then filter.

is there a way to get the keys involved in the query from jsonquery so that I could combine it with level-search?

With LevelUp you can specify whether you'd like the levelup keys and values to be present in the pipe(). If keys is true then the key can be used for the filtering.

Here's an example:

var levelup = require('levelup')
  , rimraf = require('rimraf')
  , jsonquery = require('jsonquery');

// Location of the example database, next to this script.
var dbPath = __dirname + '/data/mydb';

// Remove whatever is already at dbPath before opening the database.
rimraf.sync(dbPath);
var db = levelup(dbPath, {
  valueEncoding: 'json'
});

// Build 100 put operations: key i -> { name: 'name i', num: 10 * i }.
var batch = [];
for (var i = 0; i < 100; i++) {
  var obj = {
    name: 'name ' + i,
    num: 10*i
  };
  batch.push({ type: 'put', key: i, value: obj });
}

db.batch(batch, function (err) {
  // Surface a failed write instead of silently querying a database that may
  // be empty or only partially populated.
  if (err) throw err;
  query();
});

/**
 * Streams every entry (keys and values) out of `db`, filters the stream with
 * jsonquery, and logs each entry that passes the filter.
 *
 * The filter keeps entries whose `key` is >= 90 and whose `value.num`
 * satisfies `$mod: [20, 0]`.
 */
function query() {
  var entries = db.createReadStream({ keys: true, values: true });
  var filter = jsonquery({ key: { $gte: 90 }, 'value.num': { $mod: [ 20, 0 ]  } });

  entries.pipe(filter).on('data', console.log);
}

Which will print out:

$ node test.js
{ key: '90', value: { name: 'name 90', num: 900 } }
{ key: '92', value: { name: 'name 92', num: 920 } }
{ key: '94', value: { name: 'name 94', num: 940 } }
{ key: '96', value: { name: 'name 96', num: 960 } }
{ key: '98', value: { name: 'name 98', num: 980 } }

Is that the sort of thing you were after?

In any case, you can see the full list of tests here.

As long as the keys and the values are JavaScript objects, and each data object from the stream is a JavaScript object, jsonquery will be able to filter it.

Hmm, I think I didn't explain level-search very well.
level-search indexes json documents in levelup by each pair of keys.
so, if I want to search for objects that have {name: 'jsonquery'} (i.e. get the package.json for all versions of jsonquery), I only need to look up the pair name/jsonquery in the index.

That is only 5 versions, but if I was just gonna use jsonquery against the whole of npm... that is 172000 package.json files
(one per module version, of course). Clearly, it's gonna be way faster to query just 5 objects.

So, if I did a query like {'value.name': 'jsonquery', 'dist.mtime': '$max'}, I need to figure out which are the best keys to use for the query (in this case, "name", "jsonquery" would be the right choice).

So, I was wondering if there was an easy way that jsonquery could give the pairs I want to look for,
like is there an intermediate data structure jsonquery creates that I can use?

Am I making sense?

OK. I see what you mean. You want to take a JSON query, and then get a list of the keys that would be used for the query.

There's nothing really in JSON query in terms of internal data structures that would help you.

But I was in the process of writing a query optimizer system (JSONQueryOptimizer - not released yet) that would feed into this. You would define indexes on levelup (do an ensureIndex on a field in your JSON data and have levelup index it using a sublevel), then pass a JSON query together with the list of indexes, and it would tell you which indexes to use. You then feed the stream of potential data into jsonquery to get the actual results.

Here's what I was working on (sorry it's in CoffeeScript, I was going through a phase 😄)

# Query optimizer entry point.
#
#   indexes  - array of indexed field names
#   query    - MongoDB-style query object
#   callback - invoked as callback(null, plan)
#
# The plan is an array holding the single task optimizerMatch derives from
# the query, or an empty array when it derives none.
optimizer = (indexes, query, callback) ->
  rootTask = optimizerMatch indexes, query
  plan = if rootTask then [rootTask] else []
  callback null, plan

# Find the first index-backed task for a predicate object.
#
#   indexes   - array of indexed field names
#   predicate - query object whose keys are field names or $-operators
#
# Keys are visited in object order and the first one that produces a task
# wins. Returns undefined when no key can use an index.
optimizerMatch = (indexes, predicate) ->
  for n, v of predicate
    # LHS $and, $or, $not
    if n[0] == '$'
      task = optimizerOperator indexes, n, v
      return task if task
    # complex RHS predicate (eg. $in, $gt)
    # The `?.` guard keeps null/undefined values (e.g. { name: null }) from
    # throwing on `.constructor`; they are treated as simple lookups instead.
    else if v?.constructor == Object
      task = optimizeValOpMatch(indexes, n, v)
      return task if task
    # simple lookup: only worth a task when the field is indexed
    else if n in indexes
      task = {}
      task[n] = [v]
      return task

# Plan a logical operator ($and, $or, $not) that appears as a predicate key.
#
#   indexes   - array of indexed field names
#   op        - the operator name
#   predicate - the operator's argument (an array of sub-predicates for
#               $and / $or)
#
# Returns a task object, or a falsy value (null/undefined) when no index can
# be used for the clause.
optimizerOperator = (indexes, op, predicate) ->
  switch op
    when '$or', '$and'
      tasks = []
      tableScan = false

      for part in predicate
        t = optimizerMatch(indexes, part)

        # One unplannable branch of an '$or' forces a full table scan, since
        # that branch may match documents no index lookup would fetch.
        # `not t?` treats null the same as undefined.
        if op == '$or' and not t?
          tableScan = true
          break
        else
          tasks.push t if t

      if tableScan
        return null
      else if tasks.length > 1
        task = {}
        task[op] = tasks
        return task
      else if tasks.length == 1
        # A single usable branch needs no wrapping operator.
        return tasks[0]

    when '$not'
      # An index lookup for the negated predicate would fetch exactly the
      # documents that must be excluded, so a negation yields no task here.
      return null

# Plan a field whose value is an operator object (eg. { $in: [...] }).
#
#   indexes   - array of indexed field names
#   val       - name of the field the operators apply to
#   predicate - object mapping operator names to their arguments
#
# Keys that do not start with '$' are skipped. The first operator that
# yields a task wins.
optimizeValOpMatch = (indexes, val, predicate) ->
  for op, args of predicate when op[0] == '$'
    found = optimizeValOp indexes, op, val, args
    return found if found

# operators on the RHS of queries
#
#   indexes - array of indexed field names
#   op      - operator name (eg. '$in')
#   val     - name of the field the operator applies to
#   args    - the operator's argument
#
# Returns a task object, or a falsy value when no index can be used.
optimizeValOp = (indexes, op, val, args) ->
  switch op
    when '$in'
      task = {}
      task[val] = args
      return task if val in indexes

    when '$nin', '$gt', '$gte', '$ne', '$lt', '$lte', '$mod', '$not'
      # No targeted key lookup is possible for these operators, so the task
      # asks for a scan of the whole index (null value).
      # '$not' belongs here as well: planning its inner operator directly
      # would look up exactly the keys that must be excluded.
      task = {}
      task[val] = null #index scan
      return task if val in indexes

    when '$all'
      if val in indexes
        # One lookup per required element, combined with $and.
        # NOTE(review): the value here is a bare element, whereas '$in' and
        # the simple lookup in optimizerMatch store an array - confirm the
        # plan consumer accepts both shapes.
        tasks = []
        for part in args
          task = {}
          task[val] = part
          tasks.push task

        if tasks.length > 1
          task = { $and: tasks }
          return task
        else if tasks.length == 1
          return tasks[0]

    when '$elemMatch'
      # Each sub-field is addressed as "<val>.<part>" and only contributes a
      # task when that dotted key is itself indexed.
      tasks = []
      for part, v of args
        key = [ val, part ].join('.')
        if key in indexes
          task = {}
          task[key] = v
          tasks.push task

      if tasks.length > 1
        task = { $and: tasks }
        return task
      else if tasks.length == 1
        return tasks[0]

    when '$or', '$and'
      tasks = []
      for part in args
        task = optimizeValOpMatch(indexes, val, part)
        if task
          tasks.push task
        else if op == '$or'
          # An '$or' branch that cannot be planned may match documents the
          # other branches' lookups never fetch, so give up on the whole
          # clause (the same rule optimizerOperator applies to '$or').
          return null

      if tasks.length > 1
        task = {}
        task[op] = tasks
        return task
      else if tasks.length == 1
        return tasks[0]

# The module's export is the top-level optimizer function.
module.exports = optimizer

used like this:

var jqo = require('./lib/jsonqueryoptimizer');

// Field names passed to the optimizer as the available indexes.
var indexes = ['name', 'phone', 'num'];

// Match documents whose name is 'Eugene' and whose num is 423.
var query = {
  $and: [
    { name: 'Eugene' },
    { num: 423 }
  ]
};

jqo(indexes, query, function (err, plan) {
  // plan is: [{"$and":[{"name":["Eugene"]},{"num":[423]}]}]
  // i.e. read a stream from the "name" index (key "Eugene") and AND it with
  // a stream from the "num" index (key 423).
});

Probably not what you need, but it might be a starting point to get what you need.