#### @2019-05-03

0. (optional) make sure you've loaded older data into the sqlite db (see step #6 for how to do this)

1. search for new data
   - edit UTIL_PHY_BLAST
   - run bin/phy_blast_get_ids.pl
   * the result of this is a list of accessions sorted by date (newest first),
     saved into the specified directory

   # export the work directory so we can re-use it later
   $ export RUNDIR=$(grep fasta bin/UTIL_PHY_BLAST|sed -e"s/.*'\(.*\)'.*/\1/")

2. find new accessions using bin/find_new_accns.py (a sketch of the lookup it
   performs appears after step 9), e.g.:

   ## load the db into the cache
   $ vmtouch -vt tmpfs/test4.db

   $ parallel --jobs 4 "cat {} | bin/find_new_accns.py tmpfs/test4.db >{.}.new" ::: $RUNDIR/250-*.txt
   or
   $ parallel --jobs 5 "bin/find_new_accns.py tmpfs/test4.db <{} >{.}.new" ::: $RUNDIR/250-*.txt

   ###$ parallel --jobs 2 "cat {} | perl bin/find_new_accns.pl tmpfs/test2.db >{.}.new" ::: $RUNDIR/300-0*.new

   - new accessions are stored into *.new files (some will be empty)

   ## evict the db from the cache
   $ vmtouch -ve tmpfs/test4.db

3. remove empty *.new files

   $ find $RUNDIR/ -size 0c -exec rm {} \;
   or
   $ find $RUNDIR/ -name \*.new -size 0c |xargs rm

4. get data for the new accessions in genbank format

   $ parallel --jobs 2 bin/phy_blast_get_accn_data.sh {} gb ::: $RUNDIR/250-*.new

5. check if we have all the data:

   $ grep ^LOCUS $RUNDIR/250-0*.gb|wc -l
   $ cat $RUNDIR/250-0*.new|wc -l
   or
   for i in $(ls $RUNDIR/250-0*.gb); do
       x=$(grep ^LOCUS $i|wc -l)
       new=${i/.gb/.new}
       n=$(cat $new|wc -l)
       [[ "$n" == "$x" ]] || echo -e "\trm $i && bin/phy_blast_get_accn_data.sh $new gb\t\"$x != $n\""
   done

   ## view any missing accns
   $ vimdiff <(cat $RUNDIR/250-0*.new|sort) <(grep ^VERSION $RUNDIR/250-0*.gb|sed -e's/\s\+/\t/g'|cut -f2|sort)

6. load new data into the SQLite db:

   #$ parallel --jobs 2 perl bin/load_db.pl tmpfs/test2.db {} ::: fasta-20180323-COI/300-*-new.gb
   #
   # make sure you set the right locale
   $ LANG=en_US.utf8 parallel --jobs 2 python3 bin/load-gb.py tmpfs/test4.db {} ::: $RUNDIR/250-*.gb

7. make a backup of the database

   $ pigz -p8 -c tmpfs/test4.db > test4-$(date +"%Y%m%d").db.gz

8. extract fasta for all the accession numbers from #1, based on the data we
   now have in the database (a sketch of this extraction appears after step 9)

   #- search for all the accession numbers in the given file, group them by file,
   #  and return only the data for those accessions
   #$ parallel --jobs 2 \
   #    perl bin/extract_gb_using_db.pl tmpfs/test2.db {} ">"{.}.fasta \
   #    ::: $(ls fasta-20180323-COI/300-0*.txt|grep -v new.txt)
   #
   # better/faster
   $ time parallel --jobs 8 python3 -W ignore bin/dump-fasta.py tmpfs/test4.db {} ">"{.}.fasta ::: $RUNDIR/250-*.txt
   #
   # or, if we have too many *.txt files and get the "Argument list too long" error:
   $ ls $RUNDIR|grep "\.txt$"| parallel --jobs 8 python3 -W ignore bin/dump-fasta.py tmpfs/test4.db $RUNDIR/{} ">"$RUNDIR/{.}.fasta

9. remove empty fasta files

   $ find $RUNDIR/ -size 0c -exec rm {} \;
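# For reference, a minimal sketch of the new-accession lookup from step 2,
# assuming the input files hold one accession.version per line; hypothetical,
# the real bin/find_new_accns.py may differ:

   #!/usr/bin/env python3
   # sketch of find_new_accns.py -- reads accession.version strings on stdin
   # and prints only those not yet in the entries table (see the schema note
   # at the end of these notes)
   import sqlite3
   import sys

   conn = sqlite3.connect(sys.argv[1])
   cur = conn.cursor()
   for line in sys.stdin:
       version = line.strip()
       if not version:
           continue
       cur.execute("SELECT 1 FROM entries WHERE version = ?", (version,))
       if cur.fetchone() is None:
           print(version)   # not in the db yet -> ends up in the *.new file
   conn.close()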
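# Likewise, a hypothetical sketch of the step-8 extraction, assuming Biopython
# is available and that sequences are re-read from the GenBank files recorded
# in entries.file; the real bin/dump-fasta.py may work differently:

   #!/usr/bin/env python3
   # sketch of dump-fasta.py -- given the db and a 250-*.txt accession list,
   # finds the GenBank file each accession was loaded from and emits those
   # records as FASTA on stdout
   import sqlite3
   import sys

   from Bio import SeqIO  # assumes Biopython is installed

   conn = sqlite3.connect(sys.argv[1])
   wanted = {line.strip() for line in open(sys.argv[2]) if line.strip()}
   if not wanted:
       sys.exit(0)

   # group the wanted accessions by the GenBank file they live in;
   # ~250 accessions per list file stays under sqlite's parameter limit
   by_file = {}
   query = ("SELECT version, file FROM entries WHERE version IN (%s)"
            % ",".join("?" * len(wanted)))
   for version, gb_file in conn.execute(query, tuple(wanted)):
       by_file.setdefault(gb_file, set()).add(version)
   conn.close()

   # parse each GenBank file once, keeping only the requested records
   for gb_file, versions in by_file.items():
       for rec in SeqIO.parse(gb_file, "genbank"):
           if rec.id in versions:
               sys.stdout.write(rec.format("fasta"))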
*** TO BUILD the Qiime2 database continue to A1.          ***
*** TO BUILD the Blue line blast database continue to B1. ***

A1. we need to use biosql now

   $ cd ~/work/dnalc/biosql/    # on gigel
   $ mkdir -p output && rm -v ./output/*
   $ scp compute-server:/data/zmeu_restore/data/phy_blast/fasta-20180323-COI-vertebrates.fasta .
   $ time perl test-local-lineage.pl .
   # or
   $ time bin/build-taxonomy.pl fasta-20180323-COI/
   $ ls -lh ./output/

A2. create the Qiime2 database (we'll use qiime2-docker; the version matters)

   $ scp output/* greuceanu:/data/ghiban/qiime-coi
   $ docker run --rm -ti -v $PWD:/data:Z --entrypoint /bin/bash dnasubway-pl
   # ls
   fs_otu_taxonomy.txt  fs_otus.txt  tmp
   # time qiime tools import --type 'FeatureData[Sequence]' --input-path fs_otus.txt --output-path otus.qza
   # qiime tools import --type 'FeatureData[Taxonomy]' --source-format HeaderlessTSVTaxonomyFormat --input-path fs_otu_taxonomy.txt --output-path ref-taxonomy.qza

   # XXX this may take a long time and it's optional
   # qiime feature-classifier extract-reads --i-sequences otus.qza --p-f-primer GGWACWGGWTGAACWGTWTAYCCYCC --p-r-primer GGRGGRTASACSGTTCASCCSGTSCC [--p-trunc-len NNN] --verbose --o-reads ref_seqs_coi_vertebrates.qza

   # time qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads ref_seqs_coi_vertebrates.qza --i-reference-taxonomy ref-taxonomy.qza --o-classifier coi-vertebrates-2017.8.qza

   # save the history
   # history > history.txt
   # exit

   # at this point the docker container is destroyed
   # add this file to the purple line config[.live]/UBIOME
   $ ls -lh coi-vertebrates-2017.8.qza

*** BLAST DB ***

B1. filter all the fasta (keep at most 5 duplicate sequences per organism)

   $ time perl bin/filter_fasta_to_stdout.pl $RUNDIR/ > fasta-$(date +"%Y%m%d")-filtered.fasta

B2. create the database

   $ makeblastdb \
       -logfile makeblastdb-$(date +"%Y%m%d-%H%M").log \
       -title blueline-$(date +"%Y%m") \
       -parse_seqids \
       -dbtype nucl \
       -input_type fasta \
       -in fasta-$(date +"%Y%m%d")-filtered.fasta \
       -out phydb$(date +"%Y%m")

B3. copy it to the blast server (adjust the phydbYYYYMM prefix to the current build)

   $ scp -P 1234 phydb201711.* admin@blast-server:blastdb/

# NOTE: sqlite schema
CREATE TABLE entries (
    id rowid,
    accession text not null,
    version text not null,
    organism text not null,
    modified_on date not null,
    file text not null,
    UNIQUE (version)
);
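# For orientation, a minimal sketch of a loader that fills this table from one
# GenBank file (step 6), assuming Biopython; hypothetical, the real
# bin/load-gb.py may differ (e.g. it may normalize the date):

   #!/usr/bin/env python3
   # sketch of load-gb.py -- loads one GenBank file into the entries table,
   # skipping versions already present (the UNIQUE(version) constraint)
   import sqlite3
   import sys

   from Bio import SeqIO  # assumes Biopython is installed

   db, gb_file = sys.argv[1], sys.argv[2]
   conn = sqlite3.connect(db)
   with open(gb_file) as fh:
       for rec in SeqIO.parse(fh, "genbank"):
           conn.execute(
               "INSERT OR IGNORE INTO entries "
               "(accession, version, organism, modified_on, file) "
               "VALUES (?, ?, ?, ?, ?)",
               (rec.name,                            # LOCUS name, e.g. AB123456
                rec.id,                              # accession.version
                rec.annotations.get("organism", ""),
                rec.annotations.get("date", ""),     # LOCUS date, e.g. 03-MAY-2019
                gb_file))
   conn.commit()
   conn.close()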