Scaling KAZU to run over millions of abstracts
kajocina opened this issue · comments
Hi, thanks again for sharing KAZU, it looks like exactly the tool I need for setting up a simple biomed NLP system.
I am planning on using KAZU to add ontology terms to the whole PubMed abstract dump and write these out to disk. I am a bit concerned that it will not scale well using the approach you show with a single document in the tutorial (the EGFR query). However, your Ray tutorial page still says TBA.
Do you have some general suggestions for how to scale it for my task? Don't need specific code examples, unless you already have something to share, could be even "quick and dirty".
Hiya!
Yes, sorry about the delay in producing the Ray tutorial. To answer your question: you have multiple options for scaling Kazu on a large dataset. It integrates quite well with Spark's .mapPartitions concept, and also with Ray. How do you intend to serialise the results? MongoDB or to disk?
I wanted to start simple with just dumping data to disk as jsons. Would there be a large speed advantage by using MongoDB instead?
It depends on your setup. Mongo is a great way of centralising the data, so that it can be easily queried afterwards (Elasticsearch is good too). However, the DB can be a bottleneck if you have many Kazu workers and only a single Mongo instance.
In terms of batch processing with Ray, I've found it's useful to use the ray Queue concept
You can then wrap Kazu in ray actors, as follows
import logging
from typing import Iterable, Protocol
from typing import List, Any, cast, Optional
import ray
from hydra.utils import instantiate
from omegaconf import DictConfig
from ray import ObjectRef
from ray.util.queue import Queue, Full, Empty
from tqdm import tqdm
from kazu.pipeline import Pipeline
from kazu.data.data import Document
# Module-level logger shared by the queue helpers and the batch runner below.
logger = logging.getLogger(__name__)
def await_put(queue: Queue, items: Any, timeout: Optional[int] = 1) -> None:
    """Put ``items`` onto ``queue``, retrying indefinitely while the queue is full.

    :param queue: target queue.
    :param items: payload to enqueue (typically a batch of :class:`.Document`\\ s).
    :param timeout: seconds to block per attempt; ``None`` blocks until space is free.
    """
    attempt = 1
    while True:
        try:
            queue.put(items, block=True, timeout=timeout)
            return
        except Full:
            # lazy %-style args: no string formatting cost when INFO is disabled
            logger.info("cannot put into queue: Full. Attempt: %s", attempt)
            attempt += 1
def await_get(queue: Queue, timeout: Optional[int] = 1) -> List[Document]:
    """Get a batch from ``queue``, retrying indefinitely while the queue is empty.

    :param queue: source queue.
    :param timeout: seconds to block per attempt; ``None`` blocks until an item arrives.
    :return: the dequeued batch of :class:`.Document`\\ s.
    """
    attempt = 1
    while True:
        try:
            # local annotation instead of typing.cast: same effect for the type
            # checker, consistent with RayPipelineQueueWorker.run, no runtime cost
            items: List[Document] = queue.get(block=True, timeout=timeout)
            return items
        except Empty:
            # lazy %-style args: no string formatting cost when INFO is disabled
            logger.info("cannot get from queue: Empty. Attempt: %s", attempt)
            attempt += 1
class DocumentLoader(Protocol):
    """Protocol to load documents from a source, and converts them into
    :class:`.Document`."""

    def load(self) -> Iterable[List[Document]]:
        """Convert documents from a source into :class:`.Document`, and yield a list."""
        ...

    def batch_size(self) -> int:
        """Number of documents produced per batch."""
        ...

    def total_documents(self) -> Optional[int]:
        """Total Documents in this data source, if known."""
        ...

    def total_batches_if_available(self) -> Optional[int]:
        """Total number of batches :meth:`load` will yield, if known.

        :return: ``ceil(total_documents / batch_size)``, or ``None`` when the
            source size is unknown.
        """
        maybe_total = self.total_documents()
        if maybe_total is None:
            return None
        # ceiling division: a final partial batch still counts as a batch
        # (the previous int(total / size) under-counted by one in that case)
        return -(-maybe_total // self.batch_size())
class RayPipelineQueueWorker:
    """Wraps a Kazu :class:`.Pipeline` so it can run as a Ray actor: batches of
    documents are taken from an input queue, processed, and handed to an
    output queue."""

    def __init__(self, cfg: DictConfig):
        self.cfg = cfg
        self.pipeline: Pipeline = instantiate(cfg.Pipeline)
        self.logger = logging.getLogger(__name__)
        # queues are attached after construction via set_queues
        self.in_queue: Optional[Queue] = None
        self.out_queue: Optional[Queue] = None

    def run(self) -> None:
        """Process batches forever: take from the input queue, run the
        pipeline in place, push the enriched documents to the output queue."""
        while True:
            batch: list[Document] = await_get(queue=self.in_queue, timeout=None)  # type: ignore[arg-type]
            self.pipeline(batch)
            await_put(self.out_queue, batch, timeout=None)  # type: ignore[arg-type]

    def set_queues(self, in_queue: Queue, out_queue: Queue) -> None:
        """Attach the shared input/output queues this worker reads from and writes to."""
        self.in_queue = in_queue
        self.out_queue = out_queue
class RayBatchRunner:
    """Drives batch processing of documents over a pool of Ray actors.

    Data flow: ``loader -> in_queue -> worker actors (Kazu pipeline)
    -> out_queue -> caller of :meth:`start```.
    """

    def __init__(
        self,
        cfg: DictConfig,
    ):
        self.cfg = cfg
        ray.init(num_cpus=cfg.batch_processing.worker_count)
        # write your own implementation of a document loader, to create Kazu documents from source data
        self.loader: DocumentLoader = instantiate(cfg.batch_processing.document_loader)
        self.in_queue: Queue = instantiate(cfg.batch_processing.in_queue)
        self.out_queue: Queue = instantiate(cfg.batch_processing.out_queue)
        self.workers: List[ObjectRef[None]] = self.instantiate_workers()

    def instantiate_workers(self) -> List[ObjectRef]:  # type: ignore[type-arg]
        """Create one pipeline worker actor per configured worker."""
        return [
            self._instantiate_worker()
            for _ in range(self.cfg.batch_processing.worker_count)
        ]

    def _instantiate_worker(self) -> ObjectRef:  # type: ignore[type-arg]
        """Start a single :class:`RayPipelineQueueWorker` actor and kick off
        its run loop, returning the (never-finishing) run task ref."""
        PipelineQueueWorkerActor = ray.remote(RayPipelineQueueWorker)
        # .options() returns a NEW actor class; the original discarded its
        # result, so the num_cpus=1 resource request was never applied
        PipelineQueueWorkerActor = PipelineQueueWorkerActor.options(num_cpus=1)  # type: ignore[attr-defined]
        worker: ObjectRef = PipelineQueueWorkerActor.remote(self.cfg)  # type: ignore[type-arg,call-arg]
        worker.set_queues.remote(self.in_queue, self.out_queue)  # type: ignore[attr-defined]
        task: ObjectRef = worker.run.remote()  # type: ignore[type-arg,attr-defined]
        # use the module logger rather than print, consistent with the rest of the file
        logger.info("worker started")
        return task

    def start(self) -> Iterable[List[Document]]:
        """Feed batches from the loader to the workers, yielding processed
        batches as they become available.

        :yield: lists of processed :class:`.Document`\\ s.
        """
        docs_wanted = 0
        responses = 0
        for docs in tqdm(
            self.loader.load(),
            smoothing=0.1,
            total=self.loader.total_batches_if_available(),
        ):
            docs_wanted += len(docs)
            await_put(queue=self.in_queue, items=docs, timeout=5)
            # handle processed batches of docs here - perhaps with additional actors or simply write to disk
            result = await_get(queue=self.out_queue, timeout=5)
            responses += len(result)
            yield result
        # drain whatever is still in flight once the loader is exhausted
        while docs_wanted != responses:
            logger.info("awaiting final batches: %s / %s", docs_wanted, responses)
            result = await_get(queue=self.out_queue, timeout=None)
            responses += len(result)
            yield result
For speed over millions of abstracts, if I use Ray on a single multi-CPU node (essentially mimicking multiprocessing) but without multiple GPUs, do you think Kazu can still scale reasonably to my task?
yes kazu is designed to run without a GPU