cghiban / custom-blast-db

a set of tools for building custom NCBI BLAST databases

#### 2019-05-03


0. (optional) make sure you've loaded the older data into the SQLite db (see step #6 for how to do this)
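  - a quick sanity check that the db is populated (a sketch; tmpfs/test4.db is
    the path used in the later steps, and "entries" is the schema at the bottom
    of this file):

    $ sqlite3 tmpfs/test4.db 'SELECT COUNT(*), MAX(modified_on) FROM entries;'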

1. search for new data
  - edit bin/UTIL_PHY_BLAST
  - run bin/phy_blast_get_ids.pl

  * the result of this is a list of accessions sorted by date (newest first) 
    saved into the specified directory

    # export the work directory so we can re-use it later
    $ export RUNDIR=$(grep fasta bin/UTIL_PHY_BLAST|sed -e"s/.*'\(.*\)'.*/\1/")
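
    # a quick look at what was produced (these are the 250-*.txt batch files
    # used in the next steps):
    $ ls $RUNDIR/250-*.txt | wc -l
    $ cat $RUNDIR/250-*.txt | wc -l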

2. find the new accessions using bin/find_new_accns.py, e.g.:

    ## load the db into the cache
    $ vmtouch -vt tmpfs/test4.db
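
    ## (optional) -v alone only reports residency, so this confirms the db is
    ## actually cached before launching the jobs
    $ vmtouch -v tmpfs/test4.db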

    $ parallel --jobs 4 "cat {} | bin/find_new_accns.py tmpfs/test4.db >{.}.new" ::: $RUNDIR/250-*.txt
    or
    $ parallel --jobs 5 "bin/find_new_accns.py tmpfs/test4.db <{} >{.}.new" ::: $RUNDIR/250-*.txt
    ###$ parallel --jobs 2 "cat {} | perl bin/find_new_accns.pl tmpfs/test2.db  >{.}.new" ::: $RUNDIR/300-0*.new
    
    - new accessions are stored into *.new files (some will be empty)

    ## evict the db from the cache
    $ vmtouch -ve tmpfs/test4.db

3. remove empty *.new files

    $ find $RUNDIR/ -size 0c -exec rm  {} \;
    or
    $ find $RUNDIR/ -name \*.new -size 0c |xargs rm

4. get data for the new accessions in GenBank format

    $ parallel --jobs 2 bin/phy_blast_get_accn_data.sh {} gb ::: $RUNDIR/250-*.new

5. check if we have all the data:

    $ grep ^LOCUS $RUNDIR/250-0*.gb|wc -l
    $ cat $RUNDIR/250-0*.new|wc -l
    or
    for i in $(ls $RUNDIR/250-0*.gb); do 
        x=$(grep ^LOCUS $i|wc -l); 
        new=${i/.gb/.new};
        n=$(cat $new|wc -l);
        [[ "$n" == "$x" ]] || echo -e "\trm $i && bin/phy_blast_get_accn_data.sh $new gb\t\"$x != $n\"";
    done

    ## view any missing accns
    $ vimdiff <(cat $RUNDIR/250-0*.new|sort) <(grep ^VERSION $RUNDIR/250-0*.gb|sed -e's/\s\+/\t/g'|cut -f2|sort)


6. load new data into SQLite db:

    #$ parallel --jobs 2 perl bin/load_db.pl tmpfs/test2.db  {} ::: fasta-20180323-COI/300-*-new.gb
    #
    # make sure you set the right locale
    $ LANG=en_US.utf8 parallel --jobs 2 python3 bin/load-gb.py tmpfs/test4.db  {} ::: $RUNDIR/250-*.gb
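
    # sanity-check the load (a sketch against the "entries" schema below):
    $ sqlite3 tmpfs/test4.db 'SELECT COUNT(*) FROM entries;'
    $ sqlite3 tmpfs/test4.db 'SELECT file, COUNT(*) FROM entries GROUP BY file ORDER BY file DESC LIMIT 5;'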

7. make a backup of the database

    $ pigz -p8 -c tmpfs/test4.db > test4-$(date +"%Y%m%d").db.gz
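
    # restoring from a backup later is just the reverse (pigz -dc decompresses
    # to stdout):
    $ pigz -dc test4-YYYYMMDD.db.gz > tmpfs/test4.db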

8. extract fasta
    based on all the accession numbers from step #1, extract the data we have
    for them in the database

    #- search for all the accession numbers in the given file, group them by file, 
    #  and return only the data for those accessions
    #$ parallel --jobs 2 \
    #    perl bin/extract_gb_using_db.pl tmpfs/test2.db {}  ">"{.}.fasta \
    #    ::: $(ls fasta-20180323-COI/300-0*.txt|grep -v new.txt)
    #
    # better/faster
    $ time parallel --jobs 8 python3 -W ignore bin/dump-fasta.py  tmpfs/test4.db {} ">"{.}.fasta ::: $RUNDIR/250-*.txt
    #
    # or, if we have too many *.txt files and get an "Argument list too long" error:
    $ ls $RUNDIR | grep "\.txt$" | parallel --jobs 8 python3 -W ignore bin/dump-fasta.py  tmpfs/test4.db $RUNDIR/{} ">"$RUNDIR/{.}.fasta
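
    # spot-check one batch (a sketch; fewer fasta records than accessions means
    # some of them are not in the db -- 250-0001 is just an example file name):
    $ grep -c '^>' $RUNDIR/250-0001.fasta
    $ wc -l < $RUNDIR/250-0001.txt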

9. remove empty fasta files

    $ find $RUNDIR/ -size 0c -exec rm  {} \;


*** TO BUILD the Qiime2 database continue to A1. ***
*** TO BUILD the Blue line blast database continue to B1. ***

A1. we need to use BioSQL now

    $ cd ~/work/dnalc/biosql/ # on gigel
    $ mkdir -p output && rm -v ./output/*
    $ scp compute-server:/data/zmeu_restore/data/phy_blast/fasta-20180323-COI-vertebrates.fasta .
    $ time perl test-local-lineage.pl .
    #  or
    $ time bin/build-taxonomy.pl fasta-20180323-COI/
    $ ls -lh ./output/

A2. create the qiime2 database (we'll use qiime2 via docker; the version matters)

    $ scp output/* greuceanu:/data/ghiban/qiime-coi
    $ docker run --rm -ti -v $PWD:/data:Z --entrypoint /bin/bash dnasubway-pl
    # ls
    fs_otu_taxonomy.txt  fs_otus.txt  tmp

    # time qiime tools import --type 'FeatureData[Sequence]' --input-path fs_otus.txt --output-path otus.qza
    # qiime tools import --type 'FeatureData[Taxonomy]'  --source-format HeaderlessTSVTaxonomyFormat --input-path fs_otu_taxonomy.txt --output-path ref-taxonomy.qza

    # XXX this may take a long time and it's optional
    # qiime feature-classifier extract-reads --i-sequences otus.qza --p-f-primer GGWACWGGWTGAACWGTWTAYCCYCC --p-r-primer GGRGGRTASACSGTTCASCCSGTSCC [--p-trunc-len NNN] --verbose --o-reads ref_seqs_coi_vertebrates.qza

    # time qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads ref_seqs_coi_vertebrates.qza --i-reference-taxonomy ref-taxonomy.qza --o-classifier coi-vertebrates-2017.8.qza

    # save the history
    # history > history.txt

    # exit # at this point the docker container is destroyed

    # add this file to the purple line config[.live]/UBIOME
    $ ls -lh coi-vertebrates-2017.8.qza
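
    # for reference, this is roughly how the purple line will use the classifier
    # later, run inside a qiime2 container again (a sketch; rep-seqs.qza stands
    # in for the reads to classify):
    # qiime feature-classifier classify-sklearn --i-classifier coi-vertebrates-2017.8.qza --i-reads rep-seqs.qza --o-classification taxonomy.qza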


*** BLAST DB ***

B1. filter all the fasta (keep at most 5 duplicate sequences per organism)

    $ time perl bin/filter_fasta_to_stdout.pl $RUNDIR/ > fasta-$(date +"%Y%m%d")-filtered.fasta
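
    # rough before/after check on the filtering (counts fasta records):
    $ cat $RUNDIR/250-*.fasta | grep -c '^>'
    $ grep -c '^>' fasta-$(date +"%Y%m%d")-filtered.fasta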

B2. create the database

    $ makeblastdb \
        -logfile makeblastdb-$(date +"%Y%m%d-%H%M").log \
        -title blueline-$(date +"%Y%m") \
        -parse_seqids \
        -dbtype nucl \
        -input_type fasta \
        -in fasta-$(date +"%Y%m%d")-filtered.fasta \
        -out phydb$(date +"%Y%m")
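
    # verify the freshly built db (blastdbcmd ships with BLAST+):
    $ blastdbcmd -db phydb$(date +"%Y%m") -info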

B3. copy it to the blast server

    $ scp  -P 1234  phydb201711.* admin@blast-server:blastdb/
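
    # optional smoke test on the server (assumes BLAST+ is installed there):
    $ ssh -p 1234 admin@blast-server 'blastdbcmd -db blastdb/phydb201711 -info'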

# NOTE: sqlite schema

CREATE TABLE entries (
    id rowid,
    accession text not null,
    version text not null,
    organism text not null,
    modified_on date not null,
    file text not null,
    UNIQUE (version)
);
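
# e.g. the kind of lookup the find_new_accns scripts do against this table
# (a sketch; MH123456.1 is a made-up accession.version):

    $ sqlite3 tmpfs/test4.db "SELECT organism, file FROM entries WHERE version = 'MH123456.1';"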
