otrnda / CloverAnalysisPipeline


Clover genotype analysis supplementary

27/02/2019 - 15:37:17

Introduction

This document contains all of the scripts used for the Clover genotype analysis. The results are included in a separate document. Each step of the process has its own section, with the code in the order it was run.

GBS genotyping

The GBS genotyping of the 200 clover individuals. The main workflow script is a gwf pipeline (http://gwf.readthedocs.io/en/latest/), which keeps track of all job dependencies and submits the scripts in the correct order.

from gwf import Workflow
gwf = Workflow()
## Genotyping
## SNPS
gwf.target("Genotyper", outputs = ['Clover_SP_dbSNP_V1.1.final.vcf'],
           cores=2, memory='32g', walltime="240:00:00", account="NChain") << """
./Unifiedgenotyper.sh
"""
## ALL_SITES
gwf.target("Genotyper_all", outputs = ['Clover_SP_dbSNP_V1.1.final.all.vcf'],
           cores=2, memory='32g', walltime="240:00:00", account="NChain") << """
./Unifiedgenotyperall.sh
"""
## SNPS
gwf.target("SelectVariants",
           inputs=['Clover_SP_dbSNP_V1.1.final.vcf'],
           outputs = ['Clover_SP_dbSNP_V1.1.clean.vcf'],
           cores=1, memory='32g', walltime="08:00:00", account="NChain") << """
./Selectvariants.sh
"""
gwf.target("SelectVariantsMAF5",
           inputs=['Clover_SP_dbSNP_V1.1.clean.vcf'],
           outputs=['Clover_SP_dbSNP_V1.1.MAF.vcf'],
           cores=1, memory='32g', walltime="08:00:00", account="NChain") << """
./SelectvariantsMAF5.sh
"""
## ALL_SITES
gwf.target("SelectVariants_all",
           inputs=['Clover_SP_dbSNP_V1.1.final.all.vcf'],
           outputs = ['Clover_SP_dbSNP_V1.1.clean.all.vcf'],
           cores=1, memory='32g', walltime="08:00:00", account="NChain") << """
./Selectvariantsall.sh
"""
gwf.target("SelectVariantsMAF5_all",
           inputs=['Clover_SP_dbSNP_V1.1.clean.all.vcf'],
           outputs=['Clover_SP_dbSNP_V1.1.MAF.all.vcf'],
           cores=1, memory='32g', walltime="08:00:00", account="NChain") << """
./SelectvariantsMAF5all.sh
"""
## SNP density
gwf.target("SNPdensity", inputs=['Clover_SP_dbSNP_V1.1.clean.all.vcf'],
           outputs = ['snp_density.csv'],
           cores=1, memory='90g', walltime="12:00:00", account="NChain") << """
./SNPdensity.sh 100
"""
## Site Frequency Spectrum
gwf.target("SFS", inputs=['Clover_SP_dbSNP_V1.1.clean.vcf'],
           outputs = ['sfs.csv'],
           cores=1, memory='8g', walltime="02:00:00", account="NChain") << """
./SFS.sh
"""
## Linkage disequilibrium
gwf.target("LD_vcftools", inputs=['Clover_SP_dbSNP_V1.1.MAF.vcf'],
           outputs = ['LD_10k_window.geno.ld'],
           cores=1, memory='4g', walltime="04:00:00", account="NChain") << """
./LD.sh
"""
gwf.target("LD_figures", inputs=['LD_10k_window.geno.ld'],
           outputs=['chr1.png','chr2.png','chr3.png','chr4.png','chr5.png',
                    'chr6.png','chr7.png','chr8.png','chr9.png','chr10.png',
                    'chr11.png','chr12.png','chr13.png','chr14.png','chr15.png',
                    'chr16.png'],
           cores=1, memory='4g', walltime="04:00:00", account="NChain") << """
./LDR.sh
"""
## Genetic Relationship Matrix
gwf.target("GRM", inputs=['Clover_SP_dbSNP_V1.1.MAF.vcf'],
           outputs=['GRM.csv'], memory='2g', account="NChain") << """
./GRM.sh
"""

Unifiedgenotyper

./Unifiedgenotyper.sh produces the unfiltered vcf file containing only variant sites.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -T UnifiedGenotyper \
     -I /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/bam.list \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.final.vcf \
     -stand_call_conf 50 \
     -stand_emit_conf 20.0 \
     --sample_ploidy 2 \
     -nct 12 --genotyping_mode DISCOVERY \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     --output_mode EMIT_VARIANTS_ONLY \
     -rf BadCigar 2>&1 > log

Unifiedgenotyper all

./Unifiedgenotyperall.sh produces the unfiltered gvcf file, which includes all confident sites.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -T UnifiedGenotyper \
     -I /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/bam.list \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.final.all.vcf \
     -stand_call_conf 50 \
     -stand_emit_conf 20.0 \
     --sample_ploidy 2 \
     -nct 12 --genotyping_mode DISCOVERY \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     --output_mode EMIT_ALL_CONFIDENT_SITES \
     -rf BadCigar 2>&1 > log

Selectvariants

./Selectvariants.sh keeps only sites with mapping quality above 30, depth above 200, and quality above 20.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     -T SelectVariants \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.clean.vcf \
     --variant /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.final.vcf \
     --restrictAllelesTo ALL -select "MQ>30.00 && DP>200 && QUAL>20.00"

Select variants Minor Allele Frequency

./SelectvariantsMAF5.sh takes the filtered vcf file and keeps biallelic sites with a minor allele frequency above 5%.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     -T SelectVariants \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.MAF.vcf \
     --variant /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.clean.vcf \
     --restrictAllelesTo BIALLELIC -select "AF>0.05"

Select variants All sites

./Selectvariantsall.sh applies the same filter to the all-sites gvcf file, keeping only sites with mapping quality above 30, depth above 200, and quality above 20.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     -T SelectVariants \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.clean.all.vcf \
     --variant /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.final.all.vcf \
     --restrictAllelesTo ALL -select "MQ>30.00 && DP>200 && QUAL>20.00"

Select variants Minor Allele Frequency All sites

./SelectvariantsMAF5all.sh keeps biallelic sites with a minor allele frequency above 5% in the filtered all-sites gvcf file.

#!/bin/bash

source /com/extra/java/8/load.sh
source /com/extra/GATK/3.6/load.sh

java -Xmx64g -jar /com/extra/GATK/3.6/jar-bin/GenomeAnalysisTK.jar \
     -R /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
     -T SelectVariants \
     -o /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.MAF.all.vcf \
     --variant /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/WORKFLOW/Clover_SP_dbSNP_V1.1.clean.all.vcf \
     --restrictAllelesTo BIALLELIC -select "AF>0.05"

SNP density

SNP density is calculated from the filtered all-sites vcf. The genome is divided into bins, and the variants and the total number of callable sites are counted per bin; the SNP density of a bin is then the number of variants divided by the total number of sites.
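
The per-bin bookkeeping amounts to integer division of each position by the bin size. A minimal sketch of the computation with made-up positions (illustrative only; the full script follows below):

binsize = 100
sites = [(5, False), (12, True), (110, True), (150, False), (160, True)]  # (pos, is_SNP)

bins = {}  # bin index -> [number of sites, number of SNPs]
for pos, is_snp in sites:
    k = pos // binsize                   # integer division picks the bin
    counts = bins.setdefault(k, [0, 0])
    counts[0] += 1
    counts[1] += int(is_snp)

for k in sorted(bins):
    n_sites, n_snps = bins[k]
    print("%d-%d: %.2f" % (k * binsize, (k + 1) * binsize, n_snps / float(n_sites)))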

#!/bin/bash

source /com/extra/python/2.7/load.sh

python SNPdensity.py -i Clover_SP_dbSNP_V1.1.clean.all.vcf -o snp_density.csv -b $1 -d 200

SNPdensity.py script

from parseVCF import readVCF
from sys import argv, exit
from optparse import OptionParser
def collectReferenceLengths(header):
    contigs = header.split("contig")[1:]
    contigs = [contig.split(">")[0] for contig in contigs]
    lengths = {}
    
    for contig in contigs:
        ID, length = contig.split(",")
        ID = ID.split("=")[-1]
        length = length.split("=")[-1]
        
        lengths[ID] = int(length)
    
    return lengths
def makeBins(referenceSizes, binsize):
    bins = {}
    
    for chromosome, length in referenceSizes.items():
        bins[chromosome] = [{'sites':0, 'SNP':0, 'DP':0, 'AF':0}
                            for _ in range(length/binsize+1)]
    return bins
def countSNPS(data, bins, binsize, depth):
    for i in range(len(data['CHROM'])):
        chrom, pos, alt, dp, af = data['CHROM'][i], data['POS'][i], data['ALT'][i], data['INFO'][i]['DP'], data['INFO'][i].get('AF', 0)
        bpos = pos/binsize
        if dp<depth: continue
        bins[chrom][bpos]['sites'] += 1
        bins[chrom][bpos]['DP'] += dp
        if alt!=".": 
            bins[chrom][bpos]['SNP'] += 1
            try:
                bins[chrom][bpos]['AF'] += af
            except TypeError:
                print "AF was a string:", af
def SNPdensity(bins):
    SNPdens = {}
    for chromosome, counts in bins.items():
        SNPdens[chromosome] = []
        for count in counts:
            if count['sites']==0:
                SNPdens[chromosome].append('NA')
                continue
            SNPdens[chromosome].append(count['SNP']/float(count['sites']))
    return SNPdens
def printSNPdens(SNPdens, binsize):
    print "%20s %20s %20s" % ('Chromosome', 'Location', 'SNP density')
    
    for chromosome, densities in SNPdens.items():
        i = 0
        for density in densities:
            print "%20s %20s %20s" % (chromosome,
                                       str(i*binsize)+"-"+str((i+1)*binsize),
                                       str(density))
            i += 1
def writeCSV(filename, SNPdens, binsize, bins):
    f = open(filename, 'w')
    
    f.write("%s, %s, %s, %s, %s, %s, %s\n" % ('Chromosome', 'Location', 'SNP density', 'Sites', 'SNPs', 'DP', 'AF'))
    
    for chromosome, densities in SNPdens.items():
        for i, density in enumerate(densities):
            sites = bins[chromosome][i]['sites']
            SNP = bins[chromosome][i]['SNP']
            if sites!=0:
                DP = bins[chromosome][i]['DP']/sites
            else: DP = 0
            if SNP!=0:
                AF = bins[chromosome][i]['AF']/float(SNP)
            else: AF = 0
            f.write("%s, %s, %s, %s, %s, %s, %s\n" % (chromosome,
                                                      str(i*binsize)+"-"+str((i+1)*binsize),
                                                      str(density), str(sites),str(SNP),
                                                      str(DP), str(AF)))
    f.close()
def help():
    print "====== SNP density analysis of a gVCF file ====="
    print "Reads in a gvcf file, strips it down and counts"
    print "the occurrences of SNPS versus the number sites sequenced"
    print "-i <filename.vcf>                 The input vcf file"
    print "-o <filename.csv>                 The output csv file"
    print "-b binsize                        The binsize to be calculated from"
    print "-d read depth                     Filter sites less that the minimum read depth"
    print
    exit()
if __name__=="__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="output", help="<filename.csv>")
    parser.add_option('-i', type="string", nargs=1, dest="input", help="<filename.vcf>")
    parser.add_option('-b', type="int", nargs=1, dest="binsize", help="binsize in kb")
    parser.add_option('-d', type="int", nargs=1, dest="depth", help="read depth filter")
    parser.add_option('-H', action="store_true", dest="help", help="Displays help screen")
    options, args = parser.parse_args()
    if options.help!=None:
        help()
    if options.input!=None:
        infile = options.input
    else:
        raise "No input file, please include a vcf file -i <filename.vcf>"
    if options.output!=None:
        outfile = options.output
    else:
        outfile = "output.csv"
    if options.binsize!=None:
        binsize = options.binsize*1000
    else:
        raise "Please provide a binsize: -b binsize"
    if options.depth!=None:
        depth = options.depth
    else:
        print "Warning: Was not provided with a minimum depth filter. Therefore it will not filter any sites"
        print "Please use: -d minimum depth, to filter any sites with a low coverage"
        depth = 0
    print "READING DATA:"
    data, header = readVCF(infile, 9)
    print "RETRIEVING REFERENCE LENGTHS"
    reference = collectReferenceLengths(header)
    print "MAKING BINS"
    bins = makeBins(reference, binsize)
    print "COUNTING SNPS"
    countSNPS(data, bins, binsize, depth)
    print "CALCULATING SNP DENSITIES"
    SNPdens = SNPdensity(bins)
    print "WRITING TO CSV"
    writeCSV(outfile, SNPdens, binsize, bins)

Help text from the SNPdensity.py script.

====== SNP density analysis of a gVCF file =====
Reads in a gvcf file, strips it down and counts
the occurrences of SNPS versus the number of sites sequenced
-i <filename.vcf>                 The input vcf file
-o <filename.csv>                 The output csv file
-b binsize                        The binsize to be calculated from
-d read depth                     Filter sites below the minimum read depth

Site Frequency Spectrum

#!/bin/bash

source /com/extra/python/2.7/load.sh

python SFS.py -i Clover_SP_dbSNP_V1.1.clean.vcf -o sfs.csv -d 200 -s 197

SFS.py script

from sys import argv
from parseVCF import readVCF
from optparse import OptionParser
def makeSFS(data, depth, samples):
    sfs = [0 for i in range(0, samples*2)]
    for info in data:
        if info['DP']<depth: continue
        if info['AN']<samples: continue
        try:
            sfs[int(info['AC'])] += 1
        except:
            pass
    return sfs
def writeSFS(SFS, outfile):
    f = open(outfile, "w")
    f.write("AN, Count\n")
    for i, c in enumerate(SFS):
        f.write("%i, %i\n" % (i, c))
    f.close()
def help():
    print "====== S Site Frequency spectrum alysis of vcf file ====="
    print "Reads in a vcf file, strips it down and counts"
    print "the occurrences of SNPS versus the number sites sequenced"
    print "-i <filename.vcf>                 The input vcf file"
    print "-o <filename.csv>                 The output csv file"
    print "-d depth                          Read depth to filter by"
    print "-s samples                        Number of samples"
if __name__=="__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="output", help="<filename.csv>")
    parser.add_option('-i', type="string", nargs=1, dest="input", help="<filename.vcf>")
    parser.add_option('-d', type="int", nargs=1, dest="depth", help="read depth filter")
    parser.add_option('-s', type="int", nargs=1, dest="samples", help="number of samples")
    parser.add_option('-H', action="store_true", dest="help", help="Displays help screen")
    options, args = parser.parse_args()
    if options.help!=None:
        help()
    if options.input!=None:
        infile = options.input
    else:
        raise "No input file, please include a vcf file -i <filename.vcf>"
    if options.output!=None:
        outfile = options.output
    else:
        outfile = "output.csv"
    if options.depth!=None:
        depth = options.depth
    else:
        print "Warning: Was not provided with a minimum depth filter. Therefore it will not filter any sites"
        print "Please use: -d minimum depth, to filter any sites with a low coverage"
        depth = 0
    if options.samples!=None:
        samples = options.samples
    else:
        raise "Sample size unknown. Please use: -s samples, to filter sites by the number of samples."
    INFO = readVCF(infile, 9)[0]['INFO']
    sfs = makeSFS(INFO, depth, samples)
    writeSFS(sfs, outfile)
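
In effect, makeSFS builds a histogram of the INFO field's AC (alternate allele count) over the sites that pass the depth and AN filters. A toy example with made-up records:

# Toy SFS: histogram of alternate-allele counts (AC) over passing sites.
records = [{"DP": 250, "AN": 394, "AC": 1},
           {"DP": 260, "AN": 394, "AC": 1},
           {"DP": 120, "AN": 394, "AC": 3},   # dropped: DP below the -d cutoff
           {"DP": 300, "AN": 394, "AC": 3}]
depth, samples = 200, 197
sfs = [0] * (samples * 2)
for rec in records:
    if rec["DP"] < depth or rec["AN"] < samples:
        continue
    sfs[int(rec["AC"])] += 1
print(sfs[:5])  # [0, 2, 0, 1, 0]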

Parse VCF

parseVCF.py script, used by both the SNP density and SFS scripts. It reads a vcf file and loads the selected columns into memory.

from sys import argv
def simplify_name(name):
    if "/" in name:
        return name.split("/")[-1].split(".")[0]
    return name
def parseINFO(info):
    stats = {}
    for statistic in info.split(";")[:6]:
        name, value = statistic.split("=")
        try:
            stats[name] = float(value)
        except ValueError:
            stats[name] = value
    return stats
def readVCF(filename, nc=6):
    ''' 
    Reads in a VCF file while discarding most of the data.
    Only reads in chromosome (CHROM), position (POS), reference nucleotide (REF),
    alternative nucleotide (ALT), quality (QUAL), and the parsed INFO field.
    '''
    f = open(filename)
    header = ""
    names = []
    data = {}
    options = {'CHROM': lambda x: data['CHROM'].append(x),
               'POS': lambda x: data['POS'].append(int(x)),
               'ID': lambda x: 1,
               'REF': lambda x: data['REF'].append(x),
               'ALT': lambda x: data['ALT'].append(x),
               'QUAL': lambda x: data['QUAL'].append(float(x)),
               'INFO': lambda x: data['INFO'].append(parseINFO(x)),
               'FORMAT': lambda x: 1,
               'FILTER': lambda x: 1
    }
    for line in f:
        if line[:2]=="##":
            header += line
            continue
        if line[:1]=="#":
            names = line.split("\t")[:nc]
            names = [simplify_name(name) for name in names]
            names[0] = names[0][1:]
            for name in names:
                data[name] = []
            print names
            continue
        for i, point in enumerate(line.split("\t")[:nc]):
            name = names[i]
            if name not in options:
                data[name].append(point)
            else: 
                options[name](point)
    del data['ID']
    
    return data, header
if __name__=="__main__":
    data, header = readVCF(argv[1], int(argv[2]))
    print data
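
A quick usage sketch (file name illustrative); nc=9 keeps the nine standard VCF columns CHROM through FORMAT, which is how SNPdensity.py and SFS.py call it:

from parseVCF import readVCF

# data maps each retained column name to a list of values;
# header holds the raw "##" metadata lines as one string.
data, header = readVCF("Clover_SP_dbSNP_V1.1.clean.vcf", 9)
print("%s:%d DP=%s" % (data["CHROM"][0], data["POS"][0], data["INFO"][0]["DP"]))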

LD analysis

LD analysis using vcftools, computing genotype r2 for all pairs of sites within 10,000 bp of each other.

#!/bin/bash

source /com/extra/vcftools/0.1.14/load.sh

/com/extra/vcftools/0.1.14/bin/vcftools --vcf Clover_SP_dbSNP_V1.1.MAF.vcf --geno-r2 --ld-window-bp 10000 --out LD_10k_window

./LDR.sh, script that runs the R script which produces the figures.

#!/bin/bash

source /com/extra/R/3.3/load.sh

Rscript LD.R

LD.R script, which fits a log-linear decay model (lm(log(R.2) ~ Distance)) for each chromosome and plots r2 against pairwise distance.

library(ggplot2)
library(dplyr)

LD <- read.table("LD_10k_window.geno.ld", header=TRUE, sep="\t")
#LD <- read.table("TestData.txt", header=TRUE, sep="\t")

#LD$Distance <- abs(LD$POS1-LD$POS2)

LD %>% filter(!is.nan(R.2) & R.2>0 & N_INDV>180) %>%
    mutate(Distance = abs(POS1-POS2)) -> LD

chromosomes <- sort(as.vector(unique(LD$CHR)))

for(chromosome in chromosomes[-1]){
    chrsub <- LD[LD$CHR==chromosome,]
    expmodel <- lm(log(R.2) ~ Distance, data=chrsub)
    r2 <- summary(expmodel)$r.squared
    xseq <- seq(0, 10000, by=0.01)
    reg_fit <- exp(predict(expmodel, list(Distance=xseq)))
    names(reg_fit) <- NULL
    expfit <- data.frame(Distance=xseq, R.2=reg_fit)
    cat("Length of fit:", length(reg_fit), "Length of sequence", length(xseq))
    fig <- ggplot(chrsub, aes(x=Distance, y=R.2)) + geom_point(size=0.3) +
        theme_bw() + labs(title=chromosome, x = "Distance", y = "R-squared linkage") +
        theme(text = element_text(size=8)) +
        annotate("text", label=paste0("R^2 == ", r2), x=5000, y=0.90, parse=TRUE) +
        geom_line(data=expfit, aes(x=Distance, y=R.2), colour="red") +
        scale_x_log10(limits=c(1, 10000))
    ggsave(plot=fig, file=paste0(chromosome, ".png"), width=10, height=4, unit="in")
}
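
The model fitted above is log-linear: lm(log(R.2) ~ Distance) corresponds to r2(d) = exp(b0 + b1*d). A minimal numpy sketch of the same fit on synthetic data (illustrative only, not part of the pipeline):

import numpy as np

# Synthetic LD decay: r2 falls off roughly exponentially with distance.
rng = np.random.RandomState(0)
distance = rng.randint(1, 10000, size=500).astype(float)
r2 = np.exp(-distance / 2000.0) * rng.uniform(0.5, 1.0, size=500)

# Degree-1 fit to log(r2), the same model as lm(log(R.2) ~ Distance) in LD.R.
b1, b0 = np.polyfit(distance, np.log(r2), 1)
print("r2(d) ~ exp(%.3f + %.6f*d)" % (b0, b1))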

Genetic Relationship Matrix

#!/bin/bash

source /com/extra/Anaconda-Python/2.2.0-2.7/load.sh
source activate cloverevolution

python2.7 VCFtoGRM.py -i Clover_SP_dbSNP_V1.1.MAF.vcf -o GRM.csv -n 20

VCFtoGRM.py script

from sys import argv
import numpy as np
import scipy.stats as stats
from optparse import OptionParser
def collectName(string):
    return string.split("/")[-1].split(".")[0]
def getMarker(allele):
    markers = {"0/0": 1.0, "0/1": 0.0, "1/1": -1.0, "./.": None}
    return markers.get(allele.split(":")[0], None)
def readVCFmatrix(filename, n, numpy=True):
    f = open(filename)
    header = ""
    names = []
    matrix = []
    for line in f:
        if line[:2]=="##":
            header += line
            continue
        if line[:1]=="#":
            names = [collectName(name) for name in line.split("\t")[9:]]
            print "Sequence names:", names
            matrix = [[] for i in range(len(names))]
            N = len(names)
            continue
        allele_list = []
        for allele in line.split("\t")[9:]:
            allele_list.append(getMarker(allele))
        if len(filter(lambda x: x!=None, allele_list))>=(N-n):
            for i, allele in enumerate(allele_list):
                if allele!=None:
                    matrix[i].append(allele)
                else:
                    matrix[i].append(0.0)
    if numpy==True:
        matrix = np.array(matrix)
    return matrix, names, header
def replace_column_nans_by_mean(matrix):
    # Set the value of gaps/dashes in each column to be the average of the other values in the column.
    nan_indices = np.where(np.isnan(matrix))
    # np.nanmean computes each column mean while ignoring NaNs.
    column_nanmeans = np.nanmean(matrix, axis=0)
    matrix[nan_indices] = np.take(column_nanmeans, nan_indices[1])
    return matrix
def calcGRM(SNP):
    N, M = SNP.shape
    NORM = (SNP-np.mean(SNP, 0))/(np.std(SNP, 0)+0.000000000000001)
    return np.dot(NORM, NORM.T)/M
def help():
    print "====== VCF to GRM (Genetic Relationship Matrix) ====="
    print "Reads in a vcf file as an individual by Marker matrix"
    print "Calculates a GRM using this Marker matrix."
    print "-i <filename.vcf>                 The input vcf file"
    print "-o <filename.csv>                 The output GRM as a csv file"
    print "-n number of missing              How many missing alleles are allowed"
if __name__=="__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="output", help="<filename.csv>")
    parser.add_option('-i', type="string", nargs=1, dest="input", help="<filename.vcf>")
    parser.add_option('-n', type="int", nargs=1, dest="missing", help="number of missing allowed")
    parser.add_option('-H', action="store_true", dest="help", help="Displays help screen")
    options, args = parser.parse_args()
    if options.help!=None:
        help()
    if options.input!=None:
        infile = options.input
    else:
        raise "No input file, please include a vcf file -i <filename.vcf>"
    if options.output!=None:
        outfile = options.output
    else:
        outfile = "GRM.csv"
    if options.missing!=None:
        nmissing = int(options.missing)
    else:
        nmissing = 0
    print "READING VCF FILE"
    matrix, names, header = readVCFmatrix(infile, nmissing)
    print "STARTING CALCULATIONS"
    print "Number of individuals", matrix.shape[0]
    M = matrix.shape[1]
    print "Number of Markers", M
    matrix = replace_column_nans_by_mean(matrix)
    GRM = calcGRM(matrix)
    print "SAVING FILE"
    np.savetxt(outfile, GRM, delimiter=", ")
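
For intuition, calcGRM standardizes each marker column and averages the outer products, GRM = Z Z^T / M, so the diagonal averages to 1 by construction. A toy check on made-up data:

import numpy as np

def calc_grm(snp):
    # Standardize each marker column, then GRM = Z Z^T / M.
    z = (snp - snp.mean(axis=0)) / (snp.std(axis=0) + 1e-15)
    return z.dot(z.T) / snp.shape[1]

# Toy marker matrix: 3 individuals x 4 markers, coded 1/0/-1 as in getMarker().
snp = np.array([[ 1.0, 0.0, -1.0,  1.0],
                [ 1.0, 1.0, -1.0,  0.0],
                [-1.0, 0.0,  1.0, -1.0]])
grm = calc_grm(snp)
print(np.round(grm, 2))            # similar individuals get positive entries
print(grm.trace() / grm.shape[0])  # average diagonal is ~1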

GBS data analysis

This section contains all of the R scripts which visualize the results from the GBS data analysis. The results can be seen in the CloverGBS.html file.

SNP density

SNPdensity.R, script which displays the SNP density across the whole clover genome.

library(ggplot2)
library(grid)
library(gridExtra)
library(dplyr)

snp_density <- read.csv("snp_density.csv")
snp_density$SNP.density <- as.numeric(levels(snp_density$SNP.density))[snp_density$SNP.density]

convert_ranges <- function(x) {
    values <- as.numeric(unlist(strsplit(x, "-")))
    mean(values)
}

convert_ranges <- Vectorize(convert_ranges)

binsize <- 100000

snp_density$Location <- convert_ranges(levels(snp_density$Location)[snp_density$Location])
snp_density$Coverage <- snp_density$Sites/binsize

chromosomes <- sort(as.vector(unique(snp_density[,1])))

cat("")

snp_density %>% filter(Coverage>0.01) -> snp_density

max_location <- max(snp_density$Location)

cat("Number of Sites:", sum(snp_density$Sites))
cat("Number of SNPs:", sum(snp_density$SNPs))
cat("Mean SNP density:", mean(snp_density$SNP.density))
cat("Minimum SNP density:", min(snp_density$SNP.density))
cat("Maximum SNP density:", max(snp_density$SNP.density))
cat("Variance of SNP density:", var(snp_density$SNP.density))

figures <- list()
for(chromosome in chromosomes[-1]){
    chrsub <- snp_density[snp_density$Chromosome==chromosome,]
    fig <- ggplot(chrsub, aes(x=Location, y=SNP.density)) + geom_bar(stat="identity") +
        theme_classic() + labs(title=chromosome) +
        theme(text = element_text(size=6)) +
        scale_y_continuous(limits=c(0, 0.2)) +
        scale_x_continuous(limits=c(0, max_location))
    figures[[chromosome]] <- fig
}

grid.arrange(grobs = figures, ncol=1)

Site Frequency Spectrum (SFS)

SFS.R, script which displays the Site Frequency Spectrum (SFS).

library(ggplot2)

spectrum <- read.csv("sfs.csv")
spectrum <- spectrum[2:nrow(spectrum),1:2]

cat("Number of sites:", sum(spectrum$Count))

cat("MAF sites:", sum(spectrum$Count[1:20]))

cat("After removing MAF:", sum(spectrum$Count[-1:-20]))

spectrum$Proportion <- spectrum$Count/sum(spectrum$Count)

#spectrum <- spectrum[1:(floor(nrow(spectrum)/2)-1), 2]+spectrum[nrow(spectrum):(floor(nrow(spectrum)/2)+2), 2]

expected <- (1/1:393)/sum(1/1:393)

spectrum$Proportion[1:(ceiling(393/2)+1)] <- spectrum$Proportion[1:(ceiling(393/2)+1)] + spectrum$Proportion[394:(ceiling(393/2))]

expected <- expected[1:(ceiling(393/2)+1)]+expected[394:(ceiling(393/2))]

spectrum <- spectrum[1:(ceiling(393/2)+1), 1:3]


ggplot(spectrum, aes(x=AN, y=Proportion)) +
    geom_bar(stat="identity", fill="cyan", color="black", size=0.2) +
    labs(y="Proportion", x="Allele Frequency") + theme_classic() +
    geom_line(data=NULL, aes(x=1:198, y=expected), size=0.7, alpha=0.7)


#qplot(as.vector(rep(1:198, spectrum)), geom="histogram", binwidth=1,
#      color=I("black"), fill=I("cyan")) + theme_classic() + labs(y="Count", x="Allele Frequency")

Genetic Relationship Matrix (GRM)

DispalyGRM.R, which displays the GRM and PCAs of the GRM, and manually removes outliers.

library(ggplot2)
library(reshape2)
library(dplyr)

GRM <- as.matrix(read.csv("GRM.csv", header=FALSE))

Plants <- read.csv("White_Clover_Individuals_Library_Info.csv")

cat("Dim of GRM", dim(GRM))

seq_names <- c('1', '10', '100', '101', '102', '104', '105', '106', '107', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '19', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '2', '20', '200', '201', '21', '22', '23', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '64', '65', '66', '67', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '94', '95', '96', '97', '98', '99')

cat("Length of seq_names", length(seq_names))

rownames(GRM) <- seq_names
colnames(GRM) <- seq_names

GRM <- GRM[as.character(sort(as.integer(rownames(GRM)))),
           as.character(sort(as.integer(colnames(GRM))))]

collectFastaName <- Vectorize(function(x) as.integer(unlist(strsplit(as.character(x), "\\."))[1]))

plantnames <- collectFastaName(Plants$fastq.file)

indices <- vector()
for(n in rownames(GRM)){
    indices <- c(indices, which(plantnames==n))
}

all_names <- as.character(Plants$Variety[indices])
for(i in seq_along(all_names)){
    all_names[i] <- paste(unlist(strsplit(all_names[i], "\\_"))[2:3], collapse ="_")
}

rownames(GRM) <- all_names
colnames(GRM) <- all_names

GRM <- GRM[as.character(sort(rownames(GRM))),
           as.character(sort(colnames(GRM)))]

#rownames(GRM) <- 1:nrow(GRM)
#colnames(GRM) <- 1:nrow(GRM)

GRM[GRM>1] <- 1

GRMdata <- melt(GRM)

##heatmap(GRM)

ggplot(data=GRMdata, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + theme_classic() +
    scale_fill_gradientn(colours=c("black", "purple", "pink", "white")) +
    labs(x = "Individual", y = "Individual") + theme(axis.text.x = element_text(size=4),
                                                     axis.text.y = element_text(size=4))

## PCA

colorvector <- c("#a76d00","#4168ff","#59e069","#420097","#acd44f",
                 "#ff71f1","#00931d","#de006b","#47dada","#fe0e16",
                 "#002552","#9a9900","#70006e","#2d6800","#980016",
                 "#003003","#f6b89c","#4e0726","#ab9173","#504200")

collectGroup <- Vectorize(function(x) unlist(strsplit(as.character(x), "\\_"))[1])

PCA <- princomp(GRM, scores=TRUE)

plot(PCA)

plotdata <- data.frame(PC1=PCA$scores[,1],
                       PC2=PCA$scores[,2],
                       Groups=collectGroup(names(PCA$scores[,1])))

find_hull <- function(df) df[chull(df$PC1, df$PC2), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC1, Subdf$PC2),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC1, y=PC2, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC1, y=PC2, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 1", y="Principle component 2")


plotdata <- data.frame(PC2=PCA$scores[,2],
                       PC3=PCA$scores[,3],
                       Groups=collectGroup(names(PCA$scores[,2])))

find_hull <- function(df) df[chull(df$PC2, df$PC3), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC2, Subdf$PC3),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC2, y=PC3, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC2, y=PC3, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 2", y="Principle component 3")


plotdata <- data.frame(PC1=PCA$scores[,1],
                       PC3=PCA$scores[,3],
                       Groups=collectGroup(names(PCA$scores[,1])))

find_hull <- function(df) df[chull(df$PC1, df$PC3), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC1, Subdf$PC3),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC1, y=PC3, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC1, y=PC3, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 1", y="Principle component 3")



## Fixing mismatches

## Manually fixing labels

GRM <- GRM[-which(rownames(GRM)=="Aberconcor_07"), -which(rownames(GRM)=="Aberconcor_07")]
GRM <- GRM[-which(rownames(GRM)=="Aberconcor_03"), -which(rownames(GRM)=="Aberconcor_03")]
GRM <- GRM[-which(rownames(GRM)=="Aberpearl_10"), -which(rownames(GRM)=="Aberpearl_10")]
GRM <- GRM[-which(rownames(GRM)=="Avalon_05"), -which(rownames(GRM)=="Avalon_05")]
GRM <- GRM[-which(rownames(GRM)=="Avalon_06"), -which(rownames(GRM)=="Avalon_06")]
GRM <- GRM[-which(rownames(GRM)=="Barblanca_01"), -which(rownames(GRM)=="Barblanca_01")]
GRM <- GRM[-which(rownames(GRM)=="Borek_08"), -which(rownames(GRM)=="Borek_08")]
GRM <- GRM[-which(rownames(GRM)=="Brianna_01"), -which(rownames(GRM)=="Brianna_01")]

rownames(GRM)[rownames(GRM)=="Coolfin_04"] <- "Riesling_11"
colnames(GRM)[colnames(GRM)=="Coolfin_04"] <- "Riesling_11"

GRM <- GRM[as.character(sort(rownames(GRM))),
           as.character(sort(colnames(GRM)))]

cat("Dim of GRM: ", dim(GRM))

GRMdata <- melt(GRM)

ggplot(data=GRMdata, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + theme_classic() +
    scale_fill_gradientn(colours=c("black", "purple", "pink", "white")) +
    labs(x = "Individual", y = "Individual") + theme(axis.text.x = element_text(size=4),
                                                     axis.text.y = element_text(size=4))

collectGroup <- Vectorize(function(x) unlist(strsplit(as.character(x), "\\_"))[1])

PCA <- princomp(GRM, scores=TRUE)

plot(PCA)

plotdata <- data.frame(PC1=PCA$scores[,1],
                       PC2=PCA$scores[,2],
                       Groups=collectGroup(names(PCA$scores[,1])))

find_hull <- function(df) df[chull(df$PC1, df$PC2), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC1, Subdf$PC2),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC1, y=PC2, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC1, y=PC2, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 1", y="Principle component 2")


plotdata <- data.frame(PC2=PCA$scores[,2],
                       PC3=PCA$scores[,3],
                       Groups=collectGroup(names(PCA$scores[,2])))

find_hull <- function(df) df[chull(df$PC2, df$PC3), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC2, Subdf$PC3),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC2, y=PC3, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC2, y=PC3, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 2", y="Principle component 3")


plotdata <- data.frame(PC1=PCA$scores[,1],
                       PC3=PCA$scores[,3],
                       Groups=collectGroup(names(PCA$scores[,1])))

find_hull <- function(df) df[chull(df$PC1, df$PC3), ]

hulls <- data.frame()
for(Group in unique(plotdata$Groups)){
    Subdf <- plotdata %>% filter(Groups==Group)
    hull <- Subdf[chull(Subdf$PC1, Subdf$PC3),]
    hulls <- rbind(hulls, hull)
}

ggplot(plotdata, aes(x=PC1, y=PC3, color=Groups)) + geom_point() + theme_bw() +
    geom_polygon(data = hulls, aes(x=PC1, y=PC3, fill=Groups), alpha = 0.1) +
    scale_color_manual(values=colorvector) +
    scale_fill_manual(values=colorvector) +
    labs(x="Principle component 1", y="Principle component 3")

Heterozygosity

Modification of the SNP density script: each SNP density is divided by the 'length of the species tree', i.e. the harmonic sum 1/1 + 1/2 + ... + 1/(2n-1).
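
The divisor is the harmonic number a = sum of 1/i for i = 1..2n-1; with n = 200 individuals this is sum(1/1:399), as used in the R script below. A quick check:

# Harmonic sum over 2n-1 = 399 lineages (n = 200 diploid individuals).
n = 200
a = sum(1.0 / i for i in range(1, 2 * n))  # i runs 1..399
print(a)  # ~6.57, matching sum(1/1:399) below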

library(ggplot2)
library(grid)
library(gridExtra)
library(dplyr)

snp_density <- read.csv("snp_density.csv")
snp_density$SNP.density <- as.numeric(levels(snp_density$SNP.density))[snp_density$SNP.density]

convert_ranges <- function(x) {
    values <- as.numeric(unlist(strsplit(x, "-")))
    mean(values)
}

convert_ranges <- Vectorize(convert_ranges)

binsize <- 10000

snp_density$Location <- convert_ranges(levels(snp_density$Location)[snp_density$Location])
snp_density$Coverage <- snp_density$Sites/binsize

chromosomes <- sort(as.vector(unique(snp_density[,1])))

snp_density %>% filter(Coverage>0.05) -> snp_density

max_location <- max(snp_density$Location)

cat("Number of Sites:", sum(snp_density$Sites))
cat("Number of SNPs:", sum(snp_density$SNPs))

snp_density$Heterozygosity <- snp_density$SNP.density/(sum(1/1:399))

cat("Mean Heterozygosity:", mean(snp_density$Heterozygosity))

figures <- list()
for(chromosome in chromosomes[-1]){
    chrsub <- snp_density[snp_density$Chromosome==chromosome,]
    fig <- ggplot(chrsub, aes(x=Location, y=Heterozygosity)) + geom_bar(stat="identity") +
        theme_classic() + labs(title=chromosome) +
        theme(text = element_text(size=6)) +
        scale_y_continuous(limits=c(0, 0.05)) +
        scale_x_continuous(limits=c(0, max_location))
    figures[[chromosome]] <- fig
}

grid.arrange(grobs = figures, ncol=1)

PSMC Pipeline

The pipeline for the full genome sequencing of the 4 clover individuals used for the PSMC analysis. The gwf workflow for the PSMC:

from gwf import Workflow
gwf = Workflow()
def bwa_map(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '4g',
        'cores': '8',
        'walltime': '240:00:00',
        'account': 'NChain'
    }
    spec = "./map.sh "+" ".join(infiles+outfiles)
    return options, spec
gwf.target("NCL08") << bwa_map(["NCL-08/FCHGMFMBBXX-wHAXPI040514-50_L8_1.fq.gz",
                                "NCL-08/FCHGMFMBBXX-wHAXPI040514-50_L8_2.fq.gz",
                                "NCL-08/FCHGMFNBBXX-wHAXPI040514-50_L2_1.fq.gz",
                                "NCL-08/FCHGMFNBBXX-wHAXPI040514-50_L2_2.fq.gz"],
                               ["ncl-08L8.bam", "ncl-08L2.bam"])
gwf.target("NCL09") << bwa_map(["NCL-09/FCHGMFMBBXX-wHAXPI040513-52_L8_1.fq.gz",
                                "NCL-09/FCHGMFMBBXX-wHAXPI040513-52_L8_2.fq.gz",
                                "NCL-09/FCHGMFNBBXX-wHAXPI040513-52_L2_1.fq.gz",
                                "NCL-09/FCHGMFNBBXX-wHAXPI040513-52_L2_2.fq.gz"],
                               ["ncl-09L8.bam", "ncl-09L2.bam"])
gwf.target("NCL10") << bwa_map(["NCL-10/FCHGMFMBBXX-wHAXPI040512-53_L8_1.fq.gz",
                                "NCL-10/FCHGMFMBBXX-wHAXPI040512-53_L8_2.fq.gz",
                                "NCL-10/FCHGMFNBBXX-wHAXPI040512-53_L2_1.fq.gz",
                                "NCL-10/FCHGMFNBBXX-wHAXPI040512-53_L2_2.fq.gz"],
                               ["ncl-10L8.bam", "ncl-10L2.bam"])
gwf.target("NCL12") << bwa_map(["NCL-12/FCHGMFMBBXX-wHAXPI040511-54_L8_1.fq.gz",
                                "NCL-12/FCHGMFMBBXX-wHAXPI040511-54_L8_2.fq.gz",
                                "NCL-12/FCHGMFNBBXX-wHAXPI040511-54_L2_1.fq.gz",
                                "NCL-12/FCHGMFNBBXX-wHAXPI040511-54_L2_2.fq.gz"],
                               ["ncl-12L8.bam", "ncl-12L2.bam"])
def merge_reads(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '4g',
        'walltime': '144:00:00',
        'account': 'NChain'
    }
    spec = "./merge.sh "+" ".join(infiles+outfiles)
    return options, spec
gwf.target("Merge_NCL08") << merge_reads(["ncl-08L8.bam", "ncl-08L2.bam"],
                                         ["ncl-08.u.bam"])
gwf.target("Merge_NCL09") << merge_reads(["ncl-09L8.bam", "ncl-09L2.bam"],
                                         ["ncl-09.u.bam"])
gwf.target("Merge_NCL10") << merge_reads(["ncl-10L8.bam", "ncl-10L2.bam"],
                                         ["ncl-10.u.bam"])
gwf.target("Merge_NCL12") << merge_reads(["ncl-12L8.bam", "ncl-12L2.bam"],
                                         ["ncl-12.u.bam"])
def sort_reads(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '4g',
        'walltime': '144:00:00',
        'account': 'NChain'
    }
    spec = "./sort.sh "+" ".join(infiles+outfiles)
    return options, spec
gwf.target("Sort_NCL08") << sort_reads(["ncl-08.u.bam"],
                                       ["ncl-08.bam"])
gwf.target("Sort_NCL09") << sort_reads(["ncl-09.u.bam"],
                                       ["ncl-09.bam"])
gwf.target("Sort_NCL10") << sort_reads(["ncl-10.u.bam"],
                                       ["ncl-10.bam"])
gwf.target("Sort_NCL12") << sort_reads(["ncl-12.u.bam"],
                                       ["ncl-12.bam"])
def indexbam(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '1g',
        'walltime': '144:00:00',
        'account': 'NChain'
    }
    spec = "./indexbam.sh "+" ".join(infiles+outfiles)
    return options, spec
gwf.target("Index_NCL08") << indexbam(["ncl-08.bam"],
                                      ["ncl-08.bai"])
gwf.target("Index_NCL09") << indexbam(["ncl-09.bam"],
                                      ["ncl-09.bai"])
gwf.target("Index_NCL10") << indexbam(["ncl-10.bam"],
                                      ["ncl-10.bai"])
gwf.target("Index_NCL12") << indexbam(["ncl-12.bam"],
                                      ["ncl-12.bai"])
def bamToDiploid(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '1g',
        'walltime': '72:00:00',
        'account': 'NChain'
    }
    spec = "./bam2diploid.sh "+" ".join(infiles+outfiles)
    return options, spec
gwf.target("diploid_NCL08") << bamToDiploid(["ncl-08.bam"],
                                            ["ncl-08.fq.gz"])
gwf.target("diploid_NCL09") << bamToDiploid(["ncl-09.bam"],
                                            ["ncl-09.fq.gz"])
gwf.target("diploid_NCL10") << bamToDiploid(["ncl-10.bam"],
                                            ["ncl-10.fq.gz"])
gwf.target("diploid_NCL12") << bamToDiploid(["ncl-12.bam"],
                                            ["ncl-12.fq.gz"])
def PSMC(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '1g',
        'walltime': '24:00:00',
        'account': 'NChain'
    }
    spec = "./psmc.sh "+" ".join(infiles)
    
    return options, spec
gwf.target("psmc_NCL08") << PSMC(["ncl-08.fq.gz"],
                                 ["ncl-08.fq.psmc", "ncl-08.fq.psmcfa"])
gwf.target("psmc_NCL09") << PSMC(["ncl-09.fq.gz"],
                                 ["ncl-09.fq.psmc", "ncl-09.fq.psmcfa"])
gwf.target("psmc_NCL10") << PSMC(["ncl-10.fq.gz"],
                                 ["ncl-10.fq.psmc", "ncl-10.fq.psmcfa"])
gwf.target("psmc_NCL12") << PSMC(["ncl-12.fq.gz"],
                                 ["ncl-12.fq.psmc", "ncl-12.fq.psmcfa"])
def CombinePSMC(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '1g',
        'walltime': '1:00:00',
        'account': 'NChain'
    }
    spec = "cat "+" ".join(infiles)+">> "+outfiles[0]
    
    return options, spec
gwf.target("combined") << CombinePSMC(["ncl-08.fq.psmc",
                                       "ncl-09.fq.psmc",
                                       "ncl-10.fq.psmc",
                                       "ncl-12.fq.psmc"],
                                     ["combined.psmc"])
def PlotPSMC(infiles, outfiles):
    options = {
        'inputs': infiles,
        'outputs': outfiles,
        'memory': '1g',
        'walltime': '24:00:00',
        'account': 'NChain'
    }
    spec = "./plotpsmc.sh "+" ".join(infiles)
    return options, spec
gwf.target("plotPSMC") << PlotPSMC(["combined.psmc"],
                                   ["combined.eps"])

map.sh, script for mapping reads using BWA

#!/bin/bash

source /com/extra/bwa/0.7.5a/load.sh
source /com/extra/samtools/1.3/load.sh

bwa mem -t 8 -R '@RG\tID:1\tSM:1' /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
    $1 $2 | samtools view -Sb - > $5

bwa mem -t 8 -R '@RG\tID:1\tSM:1' /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta \
    $3 $4 | samtools view -Sb - > $6

merge.sh, script for merging two bam files together

#!/bin/bash

source /com/extra/samtools/1.3/load.sh

samtools merge $3 $1 $2

sort.sh, script for sorting a bam file

#!/bin/bash

source /com/extra/samtools/1.3/load.sh
source /com/extra/java/8/load.sh

samtools sort -o $2 $1

indexbam.sh, script for indexing a bam file

#!/bin/bash

source /com/extra/samtools/1.3/load.sh

samtools index $1

mv $1.bai $2

bam2diploid.sh, script for converting a bam file into a diploid consensus fastq file.

#!/bin/bash

source /com/extra/samtools/1.4.1/load.sh
source /com/extra/bcftools/1.4.1/load.sh

samtools mpileup -C 50 -uf /home/marnit/NChain/faststorage/WHITE_CLOVER/WCL_IND/Reference/TrR.v5.fasta $1 | bcftools call -c | vcfutils.pl vcf2fq -d 5 -D 100 - | gzip > ${1%.*}.fq.gz

psmc.sh, script for doing the psmc analysis.

#!/bin/bash

source /com/extra/psmc/2012-11-19/load.sh

fq2psmcfa $1 > ${1%.*}.psmcfa

psmc -N25 -t15 -r5 -p "4+25*2+4+6" -o ${1%.*}.psmc ${1%.*}.psmcfa

plotpsmc.sh, script for plotting the combined psmc results.

#!/bin/bash

source /com/extra/psmc/2012-11-19/load.sh

psmc_plot.pl -u 7e-9 -g 1 combined combined.psmc

MSMC pipeline

MSMC was used as an alternative look at the results from PSMC. The strength of MSMC is that it allows multiple individuals (haplotypes) in the analysis, giving better accuracy of the effective population size in recent times. The gwf pipeline for the analysis follows; the variables at the top determine the individuals used and the corresponding samples. This analysis used the same input as the PSMC analysis, with 4 whole-genome sequenced clover individuals.

from gwf import Workflow
gwf = Workflow()
def bamCaller(reference, bamfile, vcf_file, mask, mean_depth, chrom):
    inputs = [reference, bamfile]
    outputs = [vcf_file, mask]
    options = {
        'cores': 1,
        'memory': '16g',
        'account': 'NChain',
        'walltime': '24:00:00'
    }
    spec = '''
    source /com/extra/samtools/1.4.1/load.sh
    source /com/extra/bcftools/1.4.1/load.sh
    samtools mpileup -q 20 -Q 20 -C 50 -u -r {chrom} -f {ref} {bam} | bcftools call -c -V indels |
    ./msmc-tools/bamCaller.py {mean_cov} {mask} | gzip -c > {out}
    '''.format(ref=reference, bam=bamfile, mean_cov=mean_depth, mask=mask, out=vcf_file, chrom=chrom)
    return inputs, outputs, options, spec
reference = "/faststorage/project/NChain/WHITE_CLOVER/BRAKER_ANNOTATION/pipeline/repens/TrR.v5.fasta"
individuals = ["ncl-08", "ncl-09", "ncl-10", "ncl-12"]
bamfiles = {"ncl-08": "../ncl-08_files/ncl-08.bam",
            "ncl-09": "../ncl-09_files/ncl-09.bam",
            "ncl-10": "../ncl-10_files/ncl-10.bam",
            "ncl-12": "../ncl-12_files/ncl-12.bam"}
depths = {"ncl-08": 47, "ncl-09": 45,
          "ncl-10": 44, "ncl-12": 55}
chromosomes = ["chr0", "chr1", "chr2", "chr3", "chr4", "chr5",
               "chr6", "chr7", "chr8", "chr9", "chr10", "chr11",
               "chr12", "chr13", "chr14", "chr15", "chr16"]
masks = {}
vcf_files = {}
masks_per_individual = {}
vcf_files_per_individual = {}
for individual in individuals:
    if individual not in masks_per_individual:
        masks_per_individual[individual] = []
        vcf_files_per_individual[individual] = []
    for chrom in chromosomes:
        if chrom not in masks:
            masks[chrom] = []
            vcf_files[chrom] = []
        gwf.target_from_template(chrom+"Bamcalling"+individual.split("-")[-1],
                                 bamCaller(reference, bamfiles[individual],
                                           "vcf_files/"+individual+"_"+chrom+".vcf.gz",
                                           "mask_files/"+individual+"_"+chrom+"_mask.bed.gz",
                                           depths[individual], chrom))
        masks[chrom].append("mask_files/"+individual+"_"+chrom+"_mask.bed.gz")
        vcf_files[chrom].append("vcf_files/"+individual+"_"+chrom+".vcf.gz")
        masks_per_individual[individual].append("mask_files/"+individual+"_"+chrom+"_mask.bed.gz")
        vcf_files_per_individual[individual].append("vcf_files/"+individual+"_"+chrom+".vcf.gz")
def generate_multihetsep(masks, vcf_files, output):
    inputs = masks+vcf_files
    outputs = [output]
    options = {
        'cores': 1,
        'memory': '16g',
        'account': 'NChain',
        'walltime': '24:00:00'
    }
    spec = "./msmc-tools/generate_multihetsep.py "
    for mask in masks:
        spec += "--mask={} ".format(mask)
    for vcf_file in vcf_files:
        spec += "{} ".format(vcf_file)
    spec += "> {}".format(output)
    return inputs, outputs, options, spec
multihetsep_haplo8 = []
for chrom in chromosomes:
    gwf.target_from_template("generate_multihetsep_{}_hap8".format(chrom),
                             generate_multihetsep(masks[chrom], vcf_files[chrom],
                                                  "8haplo/clover_{}.mhs".format(chrom)))
    multihetsep_haplo8.append("8haplo/clover_{}.mhs".format(chrom))
haplo6_ind = ["ncl-09", "ncl-10", "ncl-12"]
mhs_haplo6 = []
temp_masks = {}
temp_vcf = {}
for individual in haplo6_ind:
    for chrom, mask, vcf in zip(chromosomes, masks_per_individual[individual], vcf_files_per_individual[individual]):
        if chrom not in temp_masks:
            temp_masks[chrom] = []
            temp_vcf[chrom] = []
        temp_masks[chrom].append(mask)
        temp_vcf[chrom].append(vcf)
for chrom in chromosomes:
    gwf.target_from_template("generate_multihetsep_{}_hap6".format(chrom),
                             generate_multihetsep(temp_masks[chrom], temp_vcf[chrom],
                                                  "6haplo/clover_{}.mhs".format(chrom)))
    mhs_haplo6.append("6haplo/clover_{}.mhs".format(chrom))
haplo4 = {"1": ["ncl-09", "ncl-10"], "2": ["ncl-10", "ncl-12"],
          "3" : ["ncl-09", "ncl-12"]}
mhs_haplo4 = {}
for case in haplo4:
    temp_masks = {}
    temp_vcf = {}
    mhs_haplo4[case] = []
    for individual in haplo4[case]:
        for chrom, mask, vcf in zip(chromosomes, masks_per_individual[individual], vcf_files_per_individual[individual]):
            if chrom not in temp_masks:
                temp_masks[chrom] = []
                temp_vcf[chrom] = []
            temp_masks[chrom].append(mask)
            temp_vcf[chrom].append(vcf)
    for chrom in chromosomes:
        gwf.target_from_template("generate_multihetsep_{}_hap4_{}".format(chrom, case),
                                 generate_multihetsep(temp_masks[chrom], temp_vcf[chrom],
                                                      "4haplo/clover_{}_{}.mhs".format(chrom, case)))
        mhs_haplo4[case].append("4haplo/clover_{}_{}.mhs".format(chrom, case))
haplo2 = {"1": ["ncl-08"], "2": ["ncl-09"],
          "3" : ["ncl-10"], "4": ["ncl-12"]}
mhs_haplo2 = {}
for case in haplo2:
    temp_masks = {}
    temp_vcf = {}
    mhs_haplo2[case] = []
    for individual in haplo2[case]:
        for chrom, mask, vcf in zip(chromosomes, masks_per_individual[individual], vcf_files_per_individual[individual]):
            if chrom not in temp_masks:
                temp_masks[chrom] = []
                temp_vcf[chrom] = []
            temp_masks[chrom].append(mask)
            temp_vcf[chrom].append(vcf)
    for chrom in chromosomes:
        gwf.target_from_template("generate_multihetsep_{}_hap2_{}".format(chrom, case),
                                 generate_multihetsep(temp_masks[chrom], temp_vcf[chrom],
                                                      "2haplo/clover_{}_{}.mhs".format(chrom, case)))
        mhs_haplo2[case].append("2haplo/clover_{}_{}.mhs".format(chrom, case))
def msmc(multihetsep, output_name):
    inputs = multihetsep
    outputs = [output_name+".log", output_name+".loop.txt",
               output_name+".final.txt"]
    options = {
        'cores': 8,
        'memory': '68g',
        'account': 'NChain',
        'walltime': '24:00:00'
    }
    spec = "./msmc --fixedRecombination -r 0.15 -o {} -t {} ".format(output_name, options['cores'])
    for mhs in multihetsep:
        spec += "{} ".format(mhs)
    return inputs, outputs, options, spec
gwf.target_from_template("msmchaplo8",
                         msmc(multihetsep_haplo8, "clover_4_ind"))
gwf.target_from_template("msmchaplo6",
                         msmc(mhs_haplo6, "clover_3_ind"))
for case in haplo4:
    gwf.target_from_template("msmchaplo4_{}".format(case),
                             msmc(mhs_haplo4[case], "clover_2_ind_{}".format(case)))
for case in haplo2:
    gwf.target_from_template("msmchaplo2_{}".format(case),
                             msmc(mhs_haplo2[case], "clover_1_ind_{}".format(case)))

Script which scales the msmc output by the mutation rate and generation time, as described in the msmc guide.

from sys import argv, stdout
def read_table(filename, sep="\t"):
    f = open(filename)
    header = f.readline().replace("\n", "").split("\t")
    converter = {}
    table = {}
    for i, name in enumerate(header):
        converter[i] = name
        table[name] = []
    for line in f:
        elements = line.replace("\n", "").split(sep)
        for i, element in enumerate(elements):
            table[converter[i]].append(element)
    return table
def write_table(table, columns):
    stdout.write("\t".join(columns)+"\n")
    n = len(table[columns[0]])
    for i in range(n):
        row = []
        for col in columns:
            row.append(str(table[col][i]))
        stdout.write("\t".join(row)+"\n")
def convert_to_popsize(lambda00, mu):
    return (1/float(lambda00))/(2*mu)
def scale_time(time, mu, gentime):
    return (float(time)/mu)*gentime
if __name__=="__main__":
    table = read_table(argv[1])
    mu = float(argv[2])
    gentime = int(argv[3])
    table["scaledPopSize"] = map(lambda x: convert_to_popsize(x, mu), table["lambda_00"])
    table["scaledlefttime"] = map(lambda x: scale_time(x, mu, gentime), table["left_time_boundary"])
    table["scaledrighttime"] = map(lambda x: scale_time(x, mu, gentime), table["right_time_boundary"])
    write_table(table, ["time_index", "scaledlefttime", "scaledrighttime", "scaledPopSize"])
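
A worked example of the two conversions above (numbers illustrative, with the mutation rate and generation time used in plotpsmc.sh):

# Illustrative values: mu per site per generation, generation time in years.
mu, gentime = 7e-9, 1
lambda00, scaled_time = 500.0, 1e-5

Ne = (1 / lambda00) / (2 * mu)        # ~142857 individuals
years = (scaled_time / mu) * gentime  # ~1429 years
print("Ne=%.0f t=%.0f years" % (Ne, years))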

Simulations

The simulations of the different hypotheses were done using the Python package msprime. The simulation script:

import msprime
from optparse import OptionParser
def help():
    print "====== Simulation through msprime with a simple growing population ====="
    print ""
    print "-o <filename.vcf>                           Output vcf file"
    print "-N int                                      Effective population size"
    print "-m float                                    Mutation rate"
    print "-r float                                    Recombination rate"
    print "-l int                                      Genome length"
    print "-g float                                    Growth rate"
    print "-s float                                    Sample size"
    print "-t int                                      Time"
    print "--print                                     Printing debug info"
    exit()
if __name__=="__main__":
    usage = "usage: %prog [options]"
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="output", help="<filename.vcf>")
    parser.add_option('-N', type="float", nargs=1, dest="Popsize", help="Effective population size")
    parser.add_option('-m', type="float", nargs=1, dest="mutrate", help="Mutation rate")
    parser.add_option('-r', type="float", nargs=1, dest="recomrate", help="Recombination rate")
    parser.add_option('-l', type="float", nargs=1, dest="length", help="Length")
    parser.add_option('-g', type="float", nargs=1, dest="growth", help="growth rate")
    parser.add_option('-s', type="int", nargs=1, dest="samples", help="Sample size")
    parser.add_option('-t', type="float", nargs=1, dest="time", help="Time")
    parser.add_option('-d', type="int", nargs=1, dest="timeoff", help="Time of population decrease")
    parser.add_option('-b', type="int", nargs=1, dest="bottleneck", help="Bottleneck size")
    parser.add_option('--print', action="store_true", dest="prints", help="Print debug")
    parser.add_option('--start', action="store_true", dest="start", help="Trigger Population start with bottleneck")
    parser.add_option('-H', action="store_true", dest="help", help="Displays help screen")
    options, args = parser.parse_args()
    if options.help!=None:
        help()
    if options.Popsize!=None:
        Ne = options.Popsize
    else:
        Ne = 10000
    if options.mutrate!=None:
        mu = options.mutrate
    else:
        print "No mutation rate given. This will produce no variants."
        print "use -m to add a mutation rate"
        mu = None
    if options.recomrate!=None:
        r = options.recomrate
    else:
        r = None
    if options.length!=None:
        L = options.length
    else:
        L = None
    if options.growth!=None:
        g = options.growth
    else:
        g = 0
    if options.samples!=None:
        ss = options.samples
    else:
        ss = 1
    if options.time!=None:
        t = options.time
    else:
        t = 1
    if options.timeoff!=None:
        d = options.timeoff
    else:
        d = None
    if options.bottleneck!=None:
        bottleneck_size = options.bottleneck
    else:
        bottleneck_size = None
    population_configurations = [msprime.PopulationConfiguration(sample_size=ss,
                                                                 initial_size=Ne,
                                                                 growth_rate=0)]
    if d==None and bottleneck_size==None:
        demographic_events = [msprime.PopulationParametersChange(t, growth_rate=0)]
    elif d!=None and bottleneck_size==None:
        demographic_events = [msprime.PopulationParametersChange(t-d, growth_rate=g),
                              msprime.PopulationParametersChange(t, growth_rate=0)]
    elif d==None and bottleneck_size!=None:
        if options.start!=None:
            demographic_events = [msprime.PopulationParametersChange(t, initial_size=bottleneck_size)]
        else:
            demographic_events = [msprime.PopulationParametersChange(t, initial_size=bottleneck_size),
                                  msprime.PopulationParametersChange(t+1, initial_size=Ne)]
    else:
        raise """
Warning, using -d and -b together are not implemented. For instanstaneous bottlenecks use -b.
Otherwise use -d and -g to get the bottleneck size you want."""
    if options.prints!=None:
        dd = msprime.DemographyDebugger(Ne=Ne,
                                        population_configurations=population_configurations,
                                        demographic_events=demographic_events)
        dd.print_history()
    TS = msprime.simulate(Ne=Ne, length=L, recombination_rate=r, mutation_rate=mu,
                          population_configurations=population_configurations,
                          demographic_events=demographic_events)
    if options.output!=None:
        out = options.output
    else:
        out = "output.vcf"
    with open(out, "w") as vcf_file:
        TS.write_vcf(vcf_file, 2)

The help message

Usage: simulate.py [options]

Options:
  -h, --help     show this help message and exit
  -o OUTPUT      <filename.vcf>
  -N POPSIZE     Effective population size
  -m MUTRATE     Mutation rate
  -r RECOMRATE   Recombination rate
  -l LENGTH      Length
  -g GROWTH      growth rate
  -s SAMPLES     Sample size
  -t TIME        Time
  -d TIMEOFF     Time of population decrease
  -b BOTTLENECK  Bottleneck size
  --print        Print debug
  --start        Trigger Population start with bottleneck
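
An illustrative run of the simulation script, simulating a bottleneck (all parameter values here are examples, not the values used in the analysis):

python simulate.py -o sim.vcf -N 10000 -m 2e-8 -r 1e-8 -l 100000 -s 20 -t 0.5 -b 500 --print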

Divergent genes pipeline

Divergence analysis pipeline, running a given set of divergent genes and a randomly sampled set of genes. The gwf workflow for the pipeline:

from gwf import Workflow
gwf = Workflow()
#######################################
## Read candidate genes
#######################################
def readgenelist(filename):
    return open(filename).read().split("\n")
def cleangenelist(genelist, title):
    nlist = []
    for gene in genelist:
        if gene=="": continue
        gene = gene.split("To")[1]
        nlist.append(gene)
    return nlist
genelist = readgenelist("Divergentgenes.txt")
#occi_gl = cleangenelist(genelist, "occidentale")
occi_gl = genelist
#############################################
## GMAP database
#############################################
def split_subgenomes(reference, subgenome1, subgenome2):
    """ """
    inputs = [reference]
    outputs = [subgenome1, subgenome2]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    directory = reference.split("/")[0]
    spec = '''
    source activate python2
    python splitreference.py {} {} {} '{}'
    '''.format(reference, subgenome1, subgenome2, ">chr9")
    return inputs, outputs, options, spec
gwf.target_from_template("RepensSplit",
                         split_subgenomes("repens/TrR.v5.fasta",
                                          "repens/TrR.v5.To.fasta",
                                          "repens/TrR.v5.Tp.fasta"))
#############################################
## Making blast databases?
#############################################
#############################################
## genblast
#############################################
def translateSequences(infile, outfile):
    inputs = [infile]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '04:00:00'
    }
    spec = '''
    source activate python2
    python translatefasta.py {} {}
    '''.format(infile, outfile)
    return inputs, outputs, options, spec
def genblast(query_genes, database, outfile, basename=""):
    """ """
    inputs = [query_genes, database]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '04:00:00'
    }
    if basename=="":
        basename = outfile.split("/")[-1].split(".")[0]
    spec = '''
    source activate python2
    cd genBlast
    ./genblast -p genblastg -q ../{query} -t ../{db} -c 1 -r 1 -gff -cdna -o {out}
    python ../cleanfasta.py {out}_1.1c_2.3_s1_0_16_1.DNA > {out}.cleaned.DNA
    python ../FASTafilter.py '\-R1\-' {out}.cleaned.DNA > ../{output}
    '''.format(query=query_genes, db=database, out=basename, output=outfile)
    return inputs, outputs, options, spec
gwf.target_from_template("RepensTogenBlast",
                         genblast("occidentale/divergentgenes.fasta", "repens/TrR.v5.To.fasta", "genblastresults/Togenes.fasta"))
gwf.target_from_template("PalgenBlast",
                         genblast("occidentale/divergentgenes.fasta", "pallescens/final_pallescens.fa", "genblastresults/Palgenes.fasta"))
gwf.target_from_template("Paltranslate",
                         translateSequences("genblastresults/Palgenes.fasta", "genblastresults/Palgenes.prot.fasta"))
gwf.target_from_template("RepensTpgenBlast",
                         genblast("genblastresults/Palgenes.prot.fasta", "repens/TrR.v5.Tp.fasta", "genblastresults/Tpgenes.fasta"))
#############################################
## Generate isolate gene files in occidentale
#############################################
def filter_gene(inputfile, outputfile, searchterm):
    """ """
    inputs = [inputfile]
    outputs = [outputfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '00:20:00'
    }
    spec = '''
    source activate python2
    python FASTafilter.py '{}' {} > {}
    '''.format(searchterm, inputfile, outputfile)
    return inputs, outputs, options, spec
for gene in occi_gl:
    gwf.target_from_template("FilterTo%s" % gene,
                             filter_gene("occidentale/clover.cds.fa",
                                        "genes/To{}.fa".format(gene),
                                         "_"+gene))
    gwf.target_from_template("FilterTp%s" % gene,
                             filter_gene("genblastresults/Palgenes.fasta",
                                        "genes/Tp{}.fa".format(gene),
                                         "_"+gene))
    gwf.target_from_template("FilterTrTo%s" % gene,
                             filter_gene("genblastresults/Togenes.fasta",
                                        "genes/TrTo{}.fa".format(gene),
                                         "_"+gene))
    gwf.target_from_template("FilterTrTp%s" % gene,
                             filter_gene("genblastresults/Tpgenes.fasta",
                                        "genes/TrTp{}.fa".format(gene),
                                         "_"+gene))
#########################################
## Multiple alignment of the genes
#########################################
def join_genes(sequences, outfile, reference_gene):
    inputs = sequences
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '00:20:00'
    }
    spec = '''
    source activate biopython2
    python FASTafilter.py '{}' {} | python joingenes.py'''.format(reference_gene, "occidentale/clover.cds.fa")
    for sequence in sequences:
        spec += " {} ".format(sequence)
    spec += "> {}".format(outfile)
    return inputs, outputs, options, spec
def join_genes_and_translate(sequences, outfile, reference_gene):
    inputs = sequences
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '00:20:00'
    }
    spec = '''
    source activate biopython2
    python FASTafilter.py '{}' {} | python joingenesandtranslate.py'''.format(reference_gene, "occidentale/clover.cds.fa")
    for sequence in sequences:
        spec += " {} ".format(sequence)
    spec += "> {}".format(outfile)
    return inputs, outputs, options, spec
def multiple_align(genes, alignment, settings):
    """ """
    inputs = [genes]
    outputs = [alignment]
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    spec = '''
    source activate prank
    prank -d={} -o={} {}
    '''.format(genes, ".".join(alignment.split(".")[0:2]), settings)
    return inputs, outputs, options, spec
for gene in occi_gl:
    gwf.target_from_template('makeinput{}'.format(gene),
                             join_genes(['genes/To{}.fa'.format(gene),
                                         'genes/TrTo{}.fa'.format(gene),
                                         'genes/TrTp{}.fa'.format(gene),
                                         'genes/Tp{}.fa'.format(gene)],
                                        "joined_genes/{}.fasta".format(gene), gene))
for gene in occi_gl:
    gwf.target_from_template('alignment{}'.format(gene),
                             multiple_align("joined_genes/{}.fasta".format(gene),
                                            "aligned/{}.best.fas".format(gene), "-codon +F"))
for gene in occi_gl:
    gwf.target_from_template('pepmakeinput{}'.format(gene),
                             join_genes_and_translate(['genes/To{}.fa'.format(gene),
                                         'genes/TrTo{}.fa'.format(gene),
                                         'genes/TrTp{}.fa'.format(gene),
                                         'genes/Tp{}.fa'.format(gene)],
                                        "joined_proteins/{}.fasta".format(gene), gene))
for gene in occi_gl:
    gwf.target_from_template('pepalignment{}'.format(gene),
                             multiple_align("joined_proteins/{}.fasta".format(gene),
                                            "aligned_proteins/{}.best.fas".format(gene), "-protein"))
#########################################
## Calculate dN/dS between the genes
#########################################
def calculate_dNds(alignment, dnds, pdist, sites, dnds_data=""):
    inputs = [alignment]
    outputs = [dnds, pdist, sites, dnds_data]
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    spec = '''
    source activate python2
    python dNdS.py {} {} {} {} {}
    '''.format(alignment, dnds, pdist, sites, dnds_data)
    return inputs, outputs, options, spec
for gene in occi_gl:
   gwf.target_from_template('dNdS{}'.format(gene),
                            calculate_dNds("aligned/{}.best.fas".format(gene),
                                           "summarydata/{}.dnds.csv".format(gene),
                                           "summarydata/{}.pdist.csv".format(gene),
                                           "summarydata/{}.sites.csv".format(gene),
                                           "summarydata/{}.dnds.info.csv".format(gene)))
def comparePeptides(alignment, pdist, sites):
    inputs = [alignment]
    outputs = [pdist, sites]
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    spec = '''
    source activate python2
    python comparePeptides.py {} {} {}
    '''.format(alignment, pdist, sites)
    return inputs, outputs, options, spec
for gene in occi_gl:
   gwf.target_from_template('cmpPep{}'.format(gene),
                            comparePeptides("aligned_proteins/{}.best.fas".format(gene),
                                           "summarydata_protein/{}.pdist.csv".format(gene),
                                           "summarydata_protein/{}.sites.csv".format(gene)))
def summary_files(inputfiles, output):
    inputs = inputfiles
    outputs = [output]
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    inputstring = ",".join(inputfiles)
    spec = '''
    source activate python2
    python Joinsummary.py "{}" {}
    '''.format(inputstring, output)
    return inputs, outputs, options, spec
blacklist = ["840g41210.1", "0g01140.1", "4488g00060.1", "1089g30080.1", ""]
dNdSfiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    dNdSfiles.append("summarydata/{}.dnds.csv".format(gene))
pdistfiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    pdistfiles.append("summarydata/{}.pdist.csv".format(gene))
sitefiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    sitefiles.append("summarydata/{}.sites.csv".format(gene))
dNdSinfofiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    dNdSinfofiles.append("summarydata/{}.dnds.info.csv".format(gene))
gwf.target_from_template("dNdSsummarybetween",
                         summary_files(dNdSfiles, "summaryfiles/divergent_genes.dnds.csv"))
gwf.target_from_template("pdistsummarybetween",
                         summary_files(pdistfiles, "summaryfiles/divergent_genes.pdist.csv"))
gwf.target_from_template("sitesummarybetween",
                         summary_files(sitefiles, "summaryfiles/divergent_genes.sites.csv"))
gwf.target_from_template("dNdSinfosummarybetween",
                         summary_files(dNdSinfofiles, "summaryfiles/divergent_genes.dnds.info.csv"))
pdistfiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    pdistfiles.append("summarydata_protein/{}.pdist.csv".format(gene))
sitefiles = []
for gene in occi_gl:
    if gene in blacklist: continue
    sitefiles.append("summarydata_protein/{}.sites.csv".format(gene))
gwf.target_from_template("PEPpdistsummarybetween",
                         summary_files(pdistfiles, "summaryfiles/divergent_proteins.pdist.csv"))
gwf.target_from_template("PEPsitesummarybetween",
                         summary_files(sitefiles, "summaryfiles/divergent_proteins.sites.csv"))
############################################
## Sample random genes and run full pipeline
############################################
## LOAD random genes file: random_genes.txt
## Was created by:
## grep '>' occidentale/clover.cds.fa | python createRandomList.py random_genes.txt 30
# randomgenelist = readgenelist("random_genes.txt")
def SplitSampleFasta(filename, outdir, parts):
    inputs = [filename]
    outputs = []
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    fdir = "/".join(filename.split("/")[0:-1])
    for i in range(1, parts+1):
        if fdir=="":
            outputs.append(outdir+"/"+filename+".p"+str(i))
        else:
            outputs.append(fdir+"/"+outdir+"/"+filename.split("/")[-1]+".p"+str(i))
    spec = '''
    python splitfasta.py {} {} {}
    '''.format(filename, outdir, parts)
    return inputs, outputs, options, spec
parts = 100
gwf.target_from_template("SplitSamplegenes",
                         SplitSampleFasta("occidentale/clover.protein.fa", "parts", parts))
for i in range(1, parts+1):
    gwf.target_from_template("RepensTogenBlastPart"+str(i),
                         genblast("occidentale/parts/clover.protein.fa.p"+str(i), "repens/TrR.v5.To.fasta",
                                  "genblastresults/parts/Togenes.fasta.p"+str(i), "../occidentale/parts/TrTo.p"+str(i)))
    gwf.target_from_template("PalgenBlastPart"+str(i),
                             genblast("occidentale/parts/clover.protein.fa.p"+str(i), "pallescens/final_pallescens.fa",
                                      "genblastresults/parts/Palgenes.fasta.p"+str(i), "../occidentale/parts/Tp.p"+str(i)))
    gwf.target_from_template("PaltranslatePart"+str(i),
                             translateSequences("genblastresults/parts/Palgenes.fasta.p"+str(i), "genblastresults/parts/Palgenes.prot.fasta.p"+str(i)))
    gwf.target_from_template("RepensTpgenBlastPart"+str(i),
                             genblast("genblastresults/parts/Palgenes.prot.fasta.p"+str(i), "repens/TrR.v5.Tp.fasta",
                                      "genblastresults/parts/Tpgenes.fasta.p"+str(i), "../genblastresults/parts/TrTp.p"))
def join_files(sequences, outfile):
    inputs = sequences
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '00:20:00'
    }
    spec = "cat"
    for sequence in sequences:
        spec += " {} ".format(sequence)
    spec += "> {}".format(outfile)
    return inputs, outputs, options, spec
TrTogenes = []
for i in range(1, parts+1):
    TrTogenes.append("genblastresults/parts/Togenes.fasta.p"+str(i))
Palgenes = []
for i in range(1, parts+1):
    Palgenes.append("genblastresults/parts/Palgenes.fasta.p"+str(i))
TrTpgenes = []
for i in range(1, parts+1):
    TrTpgenes.append("genblastresults/parts/Tpgenes.fasta.p"+str(i))
gwf.target_from_template("AllRepensTo",
                         join_files(TrTogenes, "genblastresults/Togenes.all.fasta"))
gwf.target_from_template("AllRepensPal",
                         join_files(Palgenes, "genblastresults/Palgenes.all.fasta"))
gwf.target_from_template("AllRepensTp",
                         join_files(TrTpgenes, "genblastresults/Tpgenes.all.fasta"))
def efficient_workflow(Togenes, TrTogenes, Tpgenes, Palgenes, gene):
    inputs = [Togenes, TrTogenes, Tpgenes, Palgenes]
    outputs = []
    options = {
        'cores': 2,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '02:00:00'
    }
    spec = '''
    source activate python2
    echo "Filtering the gene from all of the respective species"
    python FASTafilter.py '_{gene}' {Togenes} > /scratch/$SLURM_JOBID/To00.gene.fasta
    python FASTafilter.py '_{gene}' {TrTogenes} > /scratch/$SLURM_JOBID/TrTo.gene.fasta
    python FASTafilter.py '_{gene}' {Tpgenes} > /scratch/$SLURM_JOBID/TrTp.gene.fasta
    python FASTafilter.py '_{gene}' {Palgenes} > /scratch/$SLURM_JOBID/Tp00.gene.fasta
    echo "Joining the gene files"
    python joingenes.py /scratch/$SLURM_JOBID/To00.gene.fasta /scratch/$SLURM_JOBID/TrTo.gene.fasta /scratch/$SLURM_JOBID/TrTp.gene.fasta /scratch/$SLURM_JOBID/Tp00.gene.fasta > /scratch/$SLURM_JOBID/joinedgenes.fasta
    cat /scratch/$SLURM_JOBID/joinedgenes.fasta
    ## Join the genes and translate
    python joingenesandtranslate.py  /scratch/$SLURM_JOBID/To00.gene.fasta /scratch/$SLURM_JOBID/TrTo.gene.fasta /scratch/$SLURM_JOBID/TrTp.gene.fasta /scratch/$SLURM_JOBID/Tp00.gene.fasta > /scratch/$SLURM_JOBID/joinedgenes.prot.fasta
    cat /scratch/$SLURM_JOBID/joinedgenes.prot.fasta
    ## Change environment
    source activate prank
    ## Align genes
    echo "Aligning the genes"
    prank -d=/scratch/$SLURM_JOBID/joinedgenes.fasta -o=/scratch/$SLURM_JOBID/alignedgenes -codon +F
    prank -d=/scratch/$SLURM_JOBID/joinedgenes.prot.fasta -o=/scratch/$SLURM_JOBID/alignedgenes.prot -protein
    cat /scratch/$SLURM_JOBID/alignedgenes.best.fas
    cat /scratch/$SLURM_JOBID/alignedgenes.prot.best.fas
    source activate python2
    ## Calculate measure on the alignments
    python dNdS.py /scratch/$SLURM_JOBID/alignedgenes.best.fas /scratch/$SLURM_JOBID/dnds.csv /scratch/$SLURM_JOBID/pdist.csv /scratch/$SLURM_JOBID/sites.csv /scratch/$SLURM_JOBID/dnds.sites.csv
    python comparePeptides.py /scratch/$SLURM_JOBID/alignedgenes.prot.best.fas /scratch/$SLURM_JOBID/pdist.prot.csv /scratch/$SLURM_JOBID/sites.prot.csv
    cat /scratch/$SLURM_JOBID/dnds.csv
    cat /scratch/$SLURM_JOBID/pdist.csv
    cat /scratch/$SLURM_JOBID/sites.csv
    cat /scratch/$SLURM_JOBID/dnds.sites.csv
    cat /scratch/$SLURM_JOBID/pdist.prot.csv
    cat /scratch/$SLURM_JOBID/sites.prot.csv
    python printcsv.py /scratch/$SLURM_JOBID/dnds.csv "{gene}" all_summaryfiles/all.genes.dnds.csv
    python printcsv.py /scratch/$SLURM_JOBID/pdist.csv "{gene}" all_summaryfiles/all.genes.pdist.csv
    python printcsv.py /scratch/$SLURM_JOBID/sites.csv "{gene}" all_summaryfiles/all.genes.sites.csv
    python printcsv.py /scratch/$SLURM_JOBID/dnds.sites.csv "{gene}" all_summaryfiles/all.genes.dnds.sites.csv
    python printcsv.py /scratch/$SLURM_JOBID/pdist.prot.csv "{gene}" all_summaryfiles/all.genes.pdist.prot.csv
    python printcsv.py /scratch/$SLURM_JOBID/sites.prot.csv "{gene}" all_summaryfiles/all.genes.sites.prot.csv
    '''.format(Togenes=Togenes, TrTogenes=TrTogenes, Tpgenes=Tpgenes, Palgenes=Palgenes, gene=gene)
    return inputs, outputs, options, spec
fullgenelist = readgenelist("allgenes.txt")
#fullgenelist = occi_gl
#fullgenelist = ["56g36650.1", "209g75490.3", "172g65010.1", "912g20040.2", "726g61350.1", "558g23290.1"]
for gene in fullgenelist:
    gwf.target_from_template("Gene"+gene,
                             efficient_workflow("occidentale/clover.cds.fa",
                                                "genblastresults/Togenes.all.fasta",
                                                "genblastresults/Tpgenes.all.fasta",
                                                "genblastresults/Palgenes.all.fasta", gene))

Splitting the white clover reference into the two subgenomes

from sys import argv
def make_new_reference_files(filename, sub1, sub2, divider=">chr9"):
    genomes = open(filename).read().split(divider)
    f = open(sub1, "w")
    f.write(genomes[0])
    f.close()
    f = open(sub2, "w")
    f.write(">chr9"+genomes[1])
    f.close()
if __name__=="__main__":
    make_new_reference_files(argv[1], argv[2], argv[3], argv[4])
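
The invocation used in the RepensSplit target above, splitting the assembly at the '>chr9' record:

python splitreference.py repens/TrR.v5.fasta repens/TrR.v5.To.fasta repens/TrR.v5.Tp.fasta '>chr9'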

Reads a fasta file and strips out any characters that should not be there from the sequences.

import sys
def read_and_fix(filename):
    f = open(filename)
    inseq = False
    alphabet = set([chr(i) for i in xrange(65, 91)])
    while True:
        c = f.read(1)
        if not c: break
        if inseq:
            if c==">":
                inseq = False
                sys.stdout.write("\n")
                sys.stdout.write(c)
            if c.upper() not in alphabet: continue
        if c=="\n":
            inseq = True
        sys.stdout.write(c)
    print
if __name__=="__main__":
    read_and_fix(sys.argv[1])
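
This is presumably the cleanfasta.py script invoked in the genblast template above; it writes the cleaned fasta to standard output:

python cleanfasta.py Togenes_1.1c_2.3_s1_0_16_1.DNA > Togenes.cleaned.DNA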

FASTafilter

FASTA filter script, which finds a given sequence by its sequence name. It first locates the name with grep, then jumps to the matching file offset to read out the sequence.

import subprocess
from sys import argv, exit
def findlocation(filename, search_term):
    out = subprocess.check_output("grep -b '{}' {}".format(search_term, filename), shell=True)
    matches = out.split("\n")[:-1]
    if len(matches)>1:
        return [int(match.split(":")[0]) for match in matches]
    return int(out.split(":")[0])
def collectsequence(filename, location):
    f = open(filename)
    f.seek(location)
    filebegun = False
    s = ""
    for line in f:
        if filebegun==False:
            if ">" in line:
                filebegun = True
                s += line
        else:
            if ">" in line: break
            s += line
    return s
def filtersequence(filename, location):
    f = open(filename)
    f.seek(location)
    filebegun = False
    for line in f:
        if filebegun==False:
            if ">" in line:
                filebegun = True
                print line,
        else:
            if ">" in line: break
            print line,
if __name__=="__main__":
    search_term, filename = argv[1:3]
    location = findlocation(filename, search_term)
    if type(location)==list:
        seqs = []
        for loc in location:
            #seqs.append([collectsequence(filename, loc), loc])
            filtersequence(filename, loc)
        #location = sorted(seqs, key=lambda x: len(x[0]))[-1][1]
    else:
        filtersequence(filename, location)
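
Example, matching the filter_gene calls in the workflow above:

python FASTafilter.py '_56g36650.1' occidentale/clover.cds.fa > genes/To56g36650.1.fa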

Joins a list of given genes, and cleans the sequence names.

import sys
from CorrectORF import readfasta
import os
from stat import S_ISFIFO
def collect_genes(genes):
    gene_files = []
    for gene in genes:
        sequence = readfasta(gene).values()
        if sequence:
            gene_files.append([gene, sequence[0].upper()])
    return gene_files
#def correct_genes(gene_files, reference):
#    for i in range(1, len(gene_files)):
#        gene_files[i][1] = clean_seq(gene_files[i][1].upper(), reference)
#    return gene_files
def write_out_fasta(genes):
    for gene in genes:
        print ">"+gene[0].split(".")[0].split("/")[-1]
        print gene[1]
if __name__=="__main__":
    if S_ISFIFO(os.fstat(0).st_mode):
        reference = "".join(sys.stdin.readlines()[1:]).replace("\n","")
        genes = sys.argv[1:]
        gene_files = collect_genes(genes)
        #gene_files = correct_genes(gene_files, reference)
        write_out_fasta(gene_files)
    else:
        genes = sys.argv[1:]
        gene_files = collect_genes(genes)
        #gene_files = correct_genes(gene_files)
        write_out_fasta(gene_files)
## COLLECT THE SEQUENCES
## REFERENCE IS ALWAYS THE FIRST SEQUENCE
## CORRECT ALL OF THE SEQUENCES ACCORDING TO THE REFERENCE
## WRITE OUT THE GENES.
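
In the join_genes template the reference gene is piped in on standard input and the per-species gene files are given as arguments:

python FASTafilter.py '56g36650.1' occidentale/clover.cds.fa | python joingenes.py genes/To56g36650.1.fa genes/TrTo56g36650.1.fa genes/TrTp56g36650.1.fa genes/Tp56g36650.1.fa > joined_genes/56g36650.1.fasta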

Joins a list of given genes, cleans the sequence names, and translates the sequences into protein.

import sys
from CorrectORF import *
import os
from stat import S_ISFIFO
def collect_genes(genes):
    gene_files = []
    for gene in genes:
        sequence = readfasta(gene).values()
        if sequence:
            gene_files.append([gene, translate(sequence[0].upper())])
    return gene_files
def correct_genes(gene_files, reference):
    for i in range(1, len(gene_files)):
        gene_files[i][1] = clean_seq(gene_files[i][1].upper(), reference)
    return gene_files
def write_out_fasta(genes):
    for gene in genes:
        print ">"+gene[0].split(".")[0].split("/")[-1]
        print gene[1]
if __name__=="__main__":
    if S_ISFIFO(os.fstat(0).st_mode):
        reference = "".join(sys.stdin.readlines()[1:]).replace("\n","")
        genes = sys.argv[1:]
        gene_files = collect_genes(genes)
        #gene_files = correct_genes(gene_files, reference)
        write_out_fasta(gene_files)
    else:
        genes = sys.argv[1:]
        gene_files = collect_genes(genes)
        #gene_files = correct_genes(gene_files)
        write_out_fasta(gene_files)
## COLLECT THE SEQUENCES
## REFERENCE IS ALWAYS THE FIRST SEQUENCE
## CORRECT ALL OF THE SEQUENCES ACCORDING TO THE REFERENCE
## WRITE OUT THE GENES.

dNdS script

Calculates dN/dS and p-distance and reports the raw counts. It produces four summary output files: dnds, pdist, sites, and dnds.info.

from sys import argv
from math import log
codon_map = {'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S',
             'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y',
             'TAA': '*', 'TAG': '*', 'TGT': 'C', 'TGC': 'C', 'TGA': '*',
             'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
             'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H',
             'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R',
             'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I',
             'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
             'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S',
             'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V',
             'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A',
             'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
             'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}
def readfasta(filename):
    sequences = open(filename).read().split(">")[1:]
    sequences = [seq.split("\n", 1) for seq in sequences]
    return {seq[0]:seq[1].replace("\n", "") for seq in sequences}
def all_paths(wd, codon, gcodon):
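    # Recursively enumerates every mutational pathway between codon and
    # gcodon, counting [synonymous, nonsynonymous] steps along each path.
    # Pathways passing through a stop codon are recorded as None and are
    # skipped by count_pathways below.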
    indices = filter(lambda x: x>-1, [w*(i+1)-1 for w, i in zip(wd, range(3))])
    pathways = []
    if len(indices)>1:
        for i in indices:
            ncodon = codon[:i]+gcodon[i]+codon[i+1:]
            b1, b2 = codon_map[codon], codon_map[ncodon]
            if b1=="*" or b2=="*":
                pathways.append(None)
                continue
            cpath = [0,0]
            if b1==b2:
                cpath[0] += 1
            else:
                cpath[1] += 1
            new_cases = wd[:i]+[False]+wd[i+1:]
            internal_paths = all_paths(new_cases, ncodon, gcodon)
            if internal_paths==None:
                pathways.append(None)
                continue
            if type(internal_paths[0])==int:
                internal_paths = [internal_paths]
            for path, i in zip(internal_paths, range(len(internal_paths))):
                if path==None:
                    pathways.append(None)
                    continue
                internal_paths[i][0] += cpath[0]
                internal_paths[i][1] += cpath[1]
                pathways.append(internal_paths[i])
    else:
        i = indices[0]
        ncodon = codon[:i]+gcodon[i]+codon[i+1:]
        b1, b2 = codon_map[codon], codon_map[ncodon]
        if b1=="*" or b2=="*":
            return None
        if b1==b2:
            return [1, 0]
        else:
            return [0, 1]
    return pathways
def count_pathways(codon1, codon2):
    s = 0
    n = 0
    which_different = []
    for b1, b2 in zip(codon1, codon2):
        if b1==b2:
            which_different.append(False)
        else:
            which_different.append(True)
    how_many_different = sum(which_different)
    pathways = all_paths(which_different, codon1, codon2)
    if pathways==None:
        return 0, 1
    if type(pathways[0])==int:
        pathways = [pathways]
    c = 0
    for pathway in pathways:
        if pathway == None: continue
        s += pathway[0]
        n += pathway[1]
        c += 1
    if c==0:
        return 0, 1
    return s/float(c), n/float(c)
def jukes_cantor(D):
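    # The Jukes-Cantor correction, -3/4 * log(1 - 4/3 * D), diverges as D
    # approaches 3/4, so distances of 0.745 or more are capped at an
    # arbitrarily large value.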
    if D>=0.745: return 27
    return -3/float(4)*log(1-4/float(3)*D)
def countN_S(seq):
    seq_n = len(seq)
    print "Sequence length:", seq_n
    N, S = 0, 0
    otherbases = {'A': ['T', 'G', 'C'],
                  'T': ['A', 'C', 'G'],
                  'G': ['A', 'T', 'C'],
                  'C': ['A', 'T', 'G']}
    for i in range(0, seq_n, 3):
        codon = seq[i:i+3]
        n, s = 0, 0
        for j in range(0,3):
            aa = codon_map.get(codon, "-")
            if aa=="-": continue
            tn, ts = 0, 0
            for k in otherbases.get(codon[j], []):
                ncodon = codon[:j]+k+codon[j+1:]
                if aa==codon_map[ncodon]:
                    ts += 1
                else:
                    tn += 1
            n += tn/float(3)
            s += ts/float(3)
        N += n
        S += s
    return N, S
def calculate_dn_ds(seq1, seq2):
    seq_n = len(seq1)
    Sd, Nd = 0, 0
    S, N = 0, 0
    for i in range(0, seq_n, 3):
        codon1, codon2 = seq1[i:i+3], seq2[i:i+3]
        base1, base2 = codon_map.get(codon1, "-"), codon_map.get(codon2, "-")
        #print base1, base2
        if base1=="-" or base2=="-": continue
        if codon1!=codon2:
            #print codon1, codon2
            sdnd = count_pathways(codon1, codon2)
            #print sdnd
            Sd += sdnd[0]
            Nd += sdnd[1]
    N1, S1 = countN_S(seq1)
    N2, S2 = countN_S(seq2)
    N, S = (N1+N2)/float(2), (S1+S2)/float(2)
    if S==0 and N==0:
        return "NaN", (N, S)
    print "S & N", S, N
    print "Sd & Nd", Sd, Nd
    pS = Sd/float(S)
    pN = Nd/float(N)
    print "pS & pN", pS, pN
    #print "pN/pS ratio", pN/pS
    dS = jukes_cantor(pS)
    dN = jukes_cantor(pN)
    print "dS & dN", dS, dN
    if dS==0:
        return "NaN", (N, S)
    print "dN/dS ratio", dN/dS
    return dN/dS, (Nd, Sd, N, S)
def heterozygote_sites(seq):
    s = seq.replace("-", "")
    for char in ["A", "T", "G", "C", "N"]:
        s = s.replace(char, "")
    print s
    return len(s)
def pdistance(seq1, seq2):
    n, d = 0, 0
    for base1, base2 in zip(seq1.upper(), seq2.upper()):
        if base1=="-" or base2=="-": continue
        if base1!=base2:
            d += 1
        n += 1
    if n==0: return "NaN", (0,0)
    return d/float(n), (d, n)
def cleanKeyNames(keys):
    new_keys = []
    for key in keys:
        if "T"==key[2] and "T"==key[0]:
            new_keys.append(key[:4])
        elif "T"==key[0]:
            new_keys.append(key[:2])
        else:
            new_keys.append(key[:11])
    return keys, new_keys
def prettyprint(matrix):
    keys = sorted(matrix.keys())
    for key in [""]+keys:
        print "{:5}".format(key),
    print "\n",
    for key in keys:
        print "{:4}".format(key),
        for key2 in keys:
            print "%.3f" % matrix[key][key2],
        print
def write_out_matrix(filename, matrix):
    keys = sorted(matrix.keys())
    f = open(filename, "w")
    f.write("ID")
    for key in keys:
        f.write(", {}".format(key))
    f.write("\n")
    for key in keys:
        f.write("{}".format(key))
        for key2 in keys:
            if type(matrix[key][key2])==tuple:
                print(matrix[key][key2])
                f.write(", ({}".format(matrix[key][key2][0]))
                for item in matrix[key][key2][1:]:
                    f.write(";{}".format(item))
                f.write(")")
            else:
                f.write(", {}".format(matrix[key][key2]))
        f.write("\n")
    f.close()
if __name__=="__main__":
    sequences = readfasta(argv[1])
    done = []
    print sequences
    keys, samples = cleanKeyNames(sequences.keys())
    for key, sample in zip(keys, samples):
        sequences[sample] = sequences[key]
        #del sequences[key]
    print sequences
    for key in keys:
        if key not in samples:
            del sequences[key]
    print sequences
    dNdS = {}
    pdist = {}
    sites = {}
    dNdSinfo = {}
    for seq1 in sequences:
        if seq1 not in dNdS:
            dNdS[seq1] = {}
            pdist[seq1] = {}
            sites[seq1] = {}
            dNdSinfo[seq1] = {}
        for seq2 in sequences:
            dNdS[seq1][seq2], dNdSinfo[seq1][seq2] = calculate_dn_ds(sequences[seq1].upper(), sequences[seq2].upper())
            pdist[seq1][seq2], sites[seq1][seq2] = pdistance(sequences[seq1].upper(), sequences[seq2].upper())
    #prettyprint(dNdS)
    #prettyprint(pdist)
    write_out_matrix(argv[2], dNdS)
    write_out_matrix(argv[3], pdist)
    write_out_matrix(argv[4], sites)
    write_out_matrix(argv[5], dNdSinfo)
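
A minimal illustration of the core functions on two hypothetical six-base sequences (the module name dNdS matches the import used by the script below):

from dNdS import calculate_dn_ds, pdistance
# AAA -> AAG is a synonymous change (both codons encode lysine),
# so only the synonymous count Sd is incremented.
ratio, info = calculate_dn_ds("ATGAAA", "ATGAAG")
# p-distance: one mismatch over six compared sites, i.e. 1/6.
p, counts = pdistance("ATGAAA", "ATGAAG")
print ratio, p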

Compares the peptide sequences from the protein alignments, reusing the p-distance and matrix functions from the dNdS script.

from dNdS import *
if __name__=="__main__":
    sequences = readfasta(argv[1])
    done = []
    keys, samples = cleanKeyNames(sequences.keys())
    for key, sample in zip(sequences.keys(), samples):
        sequences[sample] = sequences[key]
        if key not in samples:
            del sequences[key]
    pdist = {}
    sites = {}
    for seq1 in sequences:
        if seq1 not in pdist:
            pdist[seq1] = {}
            sites[seq1] = {}
        for seq2 in sequences:
            pdist[seq1][seq2], sites[seq1][seq2] = pdistance(sequences[seq1], sequences[seq2])
    #prettyprint(dNdS)
    #prettyprint(pdist)
    write_out_matrix(argv[2], pdist)
    write_out_matrix(argv[3], sites)
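
Called in the workflow as:

python comparePeptides.py aligned_proteins/56g36650.1.best.fas summarydata_protein/56g36650.1.pdist.csv summarydata_protein/56g36650.1.sites.csv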

Joins summary files returned from dNdS into one summary file.

from sys import argv
def read_csv_files(filenames):
    full_data = {}
    full_data["ID"] = []
    for filename in filenames:
        f = open(filename)
        for item in full_data:
            full_data[item].append("-")
        full_data["ID"][-1] = filename.split("/")[-1]
        row = {}
        for line in f:
            line = line.replace("\n", "")
            line = line.split(", ")
            if len(line)<2: continue
            if line[0]=="ID":
                header = line[1:]
                continue
            seq = line[0]
            for data, seq2 in zip(line[1:], header):
                if seq==seq2: continue
                combname = "|".join(sorted([seq,seq2]))
                if combname not in row:
                    row[combname] = data
        for item in row:
            if item not in full_data:
                full_data[item] = ["-" for i in range(len(full_data["ID"]))]
            full_data[item][-1] = row[item]
    return full_data
def write_out_data(filename, full_data):
    f = open(filename, "w")
    sequence_names = full_data["ID"]
    del full_data["ID"]
    keys = sorted(full_data.keys())
    nrows = len(sequence_names)
    f.write("ID")
    for key in keys:
        f.write(","+key)
    f.write("\n")
    for i in range(nrows):
        f.write(sequence_names[i])
        for key in keys:
            f.write(","+full_data[key][i])
        f.write("\n")
    f.close()
if __name__=="__main__":
    filenames = argv[1].split(",")
    combined_data = read_csv_files(filenames)
    write_out_data(argv[2], combined_data)
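
The workflow passes the input files as one comma-separated argument followed by the output file, e.g.:

python Joinsummary.py "summarydata/56g36650.1.dnds.csv,summarydata/209g75490.3.dnds.csv" summaryfiles/divergent_genes.dnds.csv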

Gene annotation pipeline

Gene annotation pipeline which uses BRAKER2 to perform the annotation, comparing the BRAKER2 annotation (AUGUSTUS with RNAseq evidence and the Medicago gene set) to the previous annotations: a Medicago-based annotation that used GMAP to map Medicago genes, and the first annotation. The main workflow is built using gwf and relies on local installations of BRAKER2, BUSCO and gmap. The final output is a .gtf file, curated based on evidence from the other gene annotations we have, together with a BUSCO report.

from gwf import Workflow
gwf = Workflow()
def extract_sequence(reference, sequence_name, output):
    """
    Extract sequence takes in a <reference>, <sequence_name> and <output>
    The <reference> is a fasta file, which must be indexed (i.e. have a .fai file).
    It looks up in the .fai file to find the <sequence_name>.
    If the name is present it reads the sequences from the <reference>,
    and saves it as <output>
    """
    inputs = [reference]
    outputs = [output]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    directory = "/".join(output.split("/")[:-1])
    spec = '''
    mkdir -p {dirc}
    python extract_sequence.py {ref} {seq} > {out}
    '''.format(ref=reference, seq=sequence_name, dirc=directory, out=output)
    return inputs, outputs, options, spec
def repeatmasking(reference, output_reference, directory):
    """
    """
    inputs = [reference]
    outputs = [output_reference]
    options = {
        'cores': 8,
        'memory': '24g',
        'account': 'NChain',
        'walltime': '08:00:00'
    }
    spec = '''
    ../../ANNOTATION/RepeatMasker/bin/RepeatMasker -xsmall -pa {cores} -dir {dirc} -species arabidopsis {ref}
    '''.format(ref=reference, cores=options['cores'], dirc=directory)
    return inputs, outputs, options, spec
def fasta_index(reference):
    inputs = [reference]
    outputs = [reference+".fai"]
    options = {
        'cores': 1,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = '''
    source /com/extra/samtools/1.6.0/load.sh
    samtools faidx {ref}
    '''.format(ref=reference)
    return inputs, outputs, options, spec
gwf.target_from_template("RepensMasking",
                         repeatmasking("repens/TrR.v5.fasta",
                                       "repens/TrR.v5.fasta.masked",
                                       "repens"))
gwf.target_from_template("OccidentaleMasking",
                         repeatmasking("occidentale/To.fasta",
                                       "occidentale/To.fasta.masked",
                                       "occidentale"))
gwf.target_from_template("PallescensMasking",
                         repeatmasking("pallescens/Tp.fasta",
                                       "pallescens/Tp.fasta.masked",
                                       "pallescens"))
gwf.target_from_template("RepensMaskindex",
                         fasta_index("repens/TrR.v5.fasta.masked"))
gwf.target_from_template("OccidentaleMaskindex",
                         fasta_index("occidentale/To.fasta.masked"))
gwf.target_from_template("PallescensMaskindex",
                         fasta_index("pallescens/Tp.fasta.masked"))
def get_sequence_names(reference):
    f = open(reference+".fai")
    sequence_names = []
    for line in f:
        sequence_names.append(line.split("\t")[0])
    return sequence_names
white_clover_chromosomes = get_sequence_names("repens/TrR.v5.fasta")
occidentale_chromosomes = get_sequence_names("occidentale/To.fasta")
pallescens_chromosomes = get_sequence_names("pallescens/Tp.fasta")
n = 10
print("White clover, chromosomes:", len(white_clover_chromosomes))
print("Occidentale, chromosomes:", len(occidentale_chromosomes))
print("Pallescens, chromosomes:", len(pallescens_chromosomes))
for chromosome in white_clover_chromosomes:
    gwf.target_from_template("Repens"+chromosome+"Extract",
                             extract_sequence("repens/TrR.v5.fasta.masked",
                                              chromosome,
                                              "runs/repens/"+chromosome+"/"+chromosome+".fa"))
#for chromosome in occidentale_chromosomes[:n]:
#     gwf.target_from_template("Occidentale"+chromosome+"Extract",
#                              extract_sequence("occidentale/To.fasta.masked",
#                                               chromosome,
#                                               "runs/occidentale/"+chromosome+"/"+chromosome+".fa"))
#for chromosome in pallescens_chromosomes[:n]:
#     gwf.target_from_template("Pallescens"+chromosome+"Extract",
#                              extract_sequence("pallescens/final_pallescens.fa.masked",
#                                               chromosome,
#                                               "runs/pallescens/"+chromosome+"/"+chromosome+".fa"))
gwf.target("OccidentaleSequence", inputs=["occidentale/To.fasta.masked"],
           outputs=["runs/occidentale/occidentalefull/To.fasta"]) << """
mkdir -p runs/occidentale/occidentalefull
cp occidentale/To.fasta.masked runs/occidentale/occidentalefull/To.fasta
"""
gwf.target("PallesecensSequence", inputs=["pallescens/Tp.fasta.masked"],
           outputs=["runs/pallescens/pallescensfull/Tp.fasta"]) << """
mkdir -p runs/pallescens/pallescensfull
cp pallescens/Tp.fasta.masked runs/pallescens/pallescensfull/Tp.fasta
"""
def filter_bam_file(bamfile, chromosome, outfile):
    """
    filter_bam_file uses samtools to read a <bamfile> and read only
    the reads that are mapped to <chromosome>.
    It saves the filtered reads into <outfile>.
    """
    inputs = [bamfile]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    directory = "/".join(outfile.split("/")[:-1])
    spec = '''
    source /com/extra/samtools/1.6.0/load.sh
    mkdir -p {dirc}
    samtools view -b {infile} {chrom} > {out}
    '''.format(infile=bamfile, chrom=chromosome, out=outfile, dirc=directory)
    return inputs, outputs, options, spec
for chromosome in white_clover_chromosomes:
    gwf.target_from_template("Repens"+chromosome+"BamExtract",
                             filter_bam_file("RNAdata/repens/pooled.reads.bam",
                                             chromosome,
                                             "runs/repens/"+chromosome+"/"+chromosome+".bam"))
# for chromosome in occidentale_chromosomes[:n]:
#      gwf.target_from_template("Occidentale"+chromosome+"BamExtract",
#                               filter_bam_file("RNAdata/occidentale/pooled_reads.bam",
#                                               chromosome,
#                                               "runs/occidentale/"+chromosome+"/"+chromosome+".bam"))
# for chromosome in pallescens_chromosomes[:n]:
#      gwf.target_from_template("Pallescens"+chromosome+"BamExtract",
#                               filter_bam_file("RNAdata/pallescens/pooled_reads.bam",
#                                               chromosome,
#                                               "runs/pallescens/"+chromosome+"/"+chromosome+".bam"))
gwf.target("OccidentaleReads", inputs=["RNAdata/occidentale/pooled_reads.bam"],
           outputs=["runs/occidentale/occidentalefull/pooled_reads.bam"]) << """
mkdir -p runs/occidentale/occidentalefull
cp RNAdata/occidentale/pooled_reads.bam runs/occidentale/occidentalefull/pooled_reads.bam
"""
gwf.target("PallesecensReads", inputs=["RNAdata/pallescens/pooled_reads.bam"],
           outputs=["runs/pallescens/pallescensfull/pooled_reads.bam"]) << """
mkdir -p runs/pallescens/pallescensfull
cp RNAdata/pallescens/pooled_reads.bam runs/pallescens/pallescensfull/pooled_reads.bam
"""
def BRAKER_gene_annotation(reference, bamfile, proteinfile, directory, species, outfile, time='12:00:00', cores=4):
    """
    Gene annoation using BRAKER.
    """
    inputs = [reference, bamfile, proteinfile]
    outputs = [outfile]
    options = {
        'cores': cores,
        'memory': '16g',
        'account': 'NChain',
        'walltime': time
    }
    levels = len(directory.split("/"))+1
    bamfile = bamfile.split("/")[-1]
    reference = reference.split("/")[-1]
    proteinfile = "../"*(levels-1)+proteinfile
    
    spec = '''
    cd {directory}
    
    source activate BRAKER
    export GENEMARK_PATH=/faststorage/project/NChain/WHITE_CLOVER/BRAKER_ANNOTATION/gm_et_linux_64/gmes_petap
    export ALIGNMENT_TOOL_PATH=/faststorage/project/NChain/WHITE_CLOVER/BRAKER_ANNOTATION/gth-1.7.1-Linux_x86_64-64bit/bin
    export AUGUSTUS_BIN_PATH=/home/marnit/anaconda3/envs/BRAKER/bin/
    export AUGUSTUS_CONFIG_PATH=/home/marnit/anaconda3/envs/BRAKER/config/
    export AUGUSTUS_SCRIPTS_PATH=/home/marnit/anaconda3/envs/BRAKER/scripts/
    export PERL5LIB=/home/marnit/anaconda3/envs/BRAKER/lib/site_perl/5.26.2
    export PERL5LIB="/home/marnit/anaconda3/envs/BRAKER/lib/site_perl/5.26.2/x86_64-linux-thread-multi:$PERL5LIB"
    export PATH="/home/marnit/anaconda3/envs/BRAKER/bin/:$PATH"
    {calldir}BRAKER_v2.1.0/braker.pl --genome={ref} --prot_seq={prot} --prg=gth --bam={bam} --cores={cores} --species={species} --softmasking --useexisting
    '''.format(directory=directory, calldir="../"*levels, species=species,
               cores=options['cores'], ref=reference, bam=bamfile, prot=proteinfile)
    return inputs, outputs, options, spec
for chromosome in white_clover_chromosomes:
    gwf.target_from_template("Repens"+chromosome+"Annotation",
                             BRAKER_gene_annotation("runs/repens/"+chromosome+"/"+chromosome+".fa",
                                                    "runs/repens/"+chromosome+"/"+chromosome+".bam",
                                                    "MedicagoProtein/Mt.proteins.fa",
                                                    "runs/repens/"+chromosome,
                                                    "repens"+chromosome,
                                                    "runs/repens/"+chromosome+"/braker/repens"+chromosome+"/augustus.hints.gtf"))
# for chromosome in occidentale_chromosomes[:n]:
#     gwf.target_from_template("Occidentale"+chromosome+"Annotation",
#                              BRAKER_gene_annotation("runs/occidentale/"+chromosome+"/"+chromosome+".fa",
#                                                     "runs/occidentale/"+chromosome+"/"+chromosome+".bam",
#                                                     "MedicagoProtein/Mt.proteins.fa",
#                                                     "runs/occidentale/"+chromosome,
#                                                     "occidentale"+chromosome,
#                                                     "runs/occidentale/"+chromosome+"/braker/occidentale"+chromosome+"/augustus.hints.gtf"))
# for chromosome in pallescens_chromosomes[:n]:
#     gwf.target_from_template("Pallescens"+chromosome+"Annotation",
#                              BRAKER_gene_annotation("runs/pallescens/"+chromosome+"/"+chromosome+".fa",
#                                                     "runs/pallescens/"+chromosome+"/"+chromosome+".bam",
#                                                     "MedicagoProtein/Mt.proteins.fa",
#                                                     "runs/pallescens/"+chromosome,
#                                                     "pallescens"+chromosome,
#                                                     "runs/pallescens/"+chromosome+"/braker/pallescens"+chromosome+"/augustus.hints.gtf"))
gwf.target_from_template("OccidentaleAnnotation",
                         BRAKER_gene_annotation("runs/occidentale/occidentalefull/To.fasta",
                                                "runs/occidentale/occidentalefull/pooled_reads.bam",
                                                "MedicagoProtein/Mt.proteins.fa",
                                                "runs/occidentale/occidentalefull",
                                                "occidentale",
                                                "runs/occidentale/occidentalefull/braker/occidentale/augustus.hints.gtf",
                                                "48:00:00",
                                                8))
gwf.target_from_template("PallescensAnnotation",
                         BRAKER_gene_annotation("runs/pallescens/pallescensfull/Tp.fasta",
                                                "runs/pallescens/pallescensfull/pooled_reads.bam",
                                                "MedicagoProtein/Mt.proteins.fa",
                                                "runs/pallescens/pallescensfull",
                                                "pallescens",
                                                "runs/pallescens/pallescensfull/braker/pallescens/augustus.hints.gtf",
                                                "48:00:00",
                                                8))
def join_hints(hints, output):
    """ """
    inputs = hints
    outputs = [output]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = "cat"
    for hint in hints:
        spec += " {}".format(hint)
    spec += " > {}".format(output)
    return inputs, outputs, options, spec
repens_hints = []
for chromosome in white_clover_chromosomes[1:]:
    repens_hints.append("runs/repens/"+chromosome+"/braker/repens"+chromosome+"/augustus.hints.gtf")
gwf.target_from_template("JoinRepensAnnotation",
                         join_hints(repens_hints,
                                    "runs/repens/repens.annotation.gtf"))
# occidentale_hints = []
# for chromosome in occidentale_chromosomes[:n]:
#      occidentale_hints.append("runs/occidentale/"+chromosome+"/braker/occidentale"+chromosome+"/augustus.hints.gtf")
# gwf.target_from_template("JoinOccidentaleAnnoation",
#                          join_hints(occidentale_hints,
#                                     "runs/occidentale/occidentale.annotation.gtf"))
# pallescens_hints = []
# for chromosome in pallescens_chromosomes[:n]:
#      pallescens_hints.append("runs/pallescens/"+chromosome+"/braker/pallescens"+chromosome+"/augustus.hints.gtf")
# gwf.target_from_template("JoinPallescensAnnoation",
#                           join_hints(pallescens_hints,
#                                      "runs/pallescens/pallescens.annotation.gtf"))
gwf.target("OccidentaleFinal", inputs=["runs/occidentale/occidentalefull/braker/occidentale/augustus.hints.gtf"],
           outputs=["runs/occidentale/occidentale.annotation.gtf"]) << """
cp runs/occidentale/occidentalefull/braker/occidentale/augustus.hints.gtf runs/occidentale/occidentale.annotation.gtf
"""
gwf.target("PallesecensFinal", inputs=["runs/pallescens/pallescensfull/braker/pallescens/augustus.hints.gtf"],
           outputs=["runs/pallescens/pallescens.annotation.gtf"]) << """
cp runs/pallescens/pallescensfull/braker/pallescens/augustus.hints.gtf runs/pallescens/pallescens.annotation.gtf
"""
def GMAP_prepare(reference, dbname):
    """ """
    inputs = [reference]
    outputs = [dbname]
    options = {
        'cores': 1,
        'memory': '12g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    directory = dbname.split("/")[0]
    spec = '''
    source /com/extra/gmap/2017-08-15/load.sh
    mkdir -p {dirc}
    gmap_build -d {dbname} {ref} -D $PWD/{dirc}
    '''.format(dbname=dbname.split("/")[-1], ref=reference, dirc=directory)
    return inputs, outputs, options, spec
gwf.target_from_template("OccidentaleDB",
                         GMAP_prepare("occidentale/To.fasta", "gmap/occidentale.gmap"))
gwf.target_from_template("PallescensDB",
                         GMAP_prepare("pallescens/Tp.fasta", "gmap/pallescens.gmap"))
def GMAP_mapping(dbname, genes, gfffile):
    """ """
    inputs = [dbname, genes]
    outputs = [gfffile]
    options = {
        'cores': 8,
        'memory': '12g',
        'account': 'NChain',
        'walltime': '24:00:00'
    }
    directory = dbname.split("/")[0]
    spec = '''
    source /com/extra/gmap/2017-08-15/load.sh
    gmap -d {} -D $PWD/{} -t 8 -f 2 {} > {}
    '''.format(dbname.split("/")[1], directory, genes, gfffile, gfffile, gfffile)
    return inputs, outputs, options, spec
gwf.target_from_template("OccidentaleMedicago",
                         GMAP_mapping("gmap/occidentale.gmap", "MedicagoProtein/Mt4.0v2_Genes.fasta",
                                      "gmap/occidentale.Mt.gff"))
gwf.target_from_template("PallescensMedicago",
                         GMAP_mapping("gmap/pallescens.gmap", "MedicagoProtein/Mt4.0v2_Genes.fasta",
                                      "gmap/pallescens.Mt.gff"))
def gff3plsorting(gff_file, outgff_file):
    """ """
    inputs = [gff_file]
    outputs = [outgff_file]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = '''
    /home/marnit/bin/gff3sort.pl --chr_order natural {infile} > {outfile}
    '''.format(infile=gff_file, outfile=outgff_file)
    return inputs, outputs, options, spec
gwf.target_from_template("OccidentaleMtSort",
                         gff3plsorting("gmap/occidentale.Mt.gff", "gff_files/occidentale.Mt.gff"))
gwf.target_from_template("PallescensMtSort",
                         gff3plsorting("gmap/pallescens.Mt.gff", "gff_files/pallescens.Mt.gff"))
def gffsorting(gff_file, outgff_file):
    """ """
    inputs = [gff_file]
    outputs = [outgff_file]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = '''
    python GFFsort.py -o  {outfile} {infile}
    '''.format(infile=gff_file, outfile=outgff_file)
    return inputs, outputs, options, spec
gwf.target_from_template("OccidentaleOldSort",
                         gffsorting("gff_files/To.v5.gff", "gff_files/To.v5.sorted.gff"))
gwf.target_from_template("PallescensOldSort",
                         gffsorting("gff_files/Tp.v4.gff", "gff_files/Tp.v4.sorted.gff"))
def copy_files(infiles, outfiles):
    """ """
    inputs = infiles
    outputs = outfiles
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = ''
    for inf, outf in zip(infiles, outfiles):
        spec += "cp {} {}\n".format(inf,outf)
    return inputs, outputs, options, spec
    
gwf.target_from_template("MovingFinalGFFfiles",
                         copy_files(["runs/repens/repens.annotation.gtf",
                                     "runs/occidentale/occidentale.annotation.gtf",
                                     "runs/pallescens/pallescens.annotation.gtf"],
                                    ["gff_files/repens.braker.gtf",
                                     "gff_files/occidentale.braker.gtf",
                                     "gff_files/pallescens.braker.gtf"]))
def ANNOTATIONcleaning(main_gff_file, additional_gff_files, output, confidence_level, binsize=25000):
    """ """
    inputs = [main_gff_file]+additional_gff_files
    outputs = [output]
    options = {
        'cores': 1,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = '''
    source activate python2
    python ANNOTATIONcleaner.py -o {outfile} -c {conflvl} -b {binsize} {main_gff}'''.format(outfile=output, conflvl=confidence_level,
                                                                                            binsize=binsize, main_gff=main_gff_file)
    for agf in additional_gff_files:
        spec += " {}".format(agf)
    return inputs, outputs, options, spec
gwf.target_from_template("RepensCleaning",
                         ANNOTATIONcleaning("gff_files/repens.braker.gtf",
                                            ["gff_files/TrR.Mt.gff", "gff_files/TrR.v5.sorted.gff"],
                                            "gff_files/repens.final.gtf",
                                            0.25))
gwf.target_from_template("OccidentaleCleaning",
                         ANNOTATIONcleaning("gff_files/occidentale.braker.gtf",
                                            ["gff_files/To.v5.sorted.gff", "gff_files/occidentale.Mt.gff"],
                                            "gff_files/occidentale.final.gtf",
                                            0.25))
gwf.target_from_template("PallescensCleaning",
                         ANNOTATIONcleaning("gff_files/pallescens.braker.gtf",
                                            ["gff_files/Tp.v4.sorted.gff", "gff_files/pallescens.Mt.gff"],
                                            "gff_files/pallescens.final.gtf",
                                            0.25))
def BUSCO(ingff, reference, proteinfile, dbname, report):
    """ """
    inputs = [ingff, reference]
    outputs = [proteinfile, report]
    options = {
        'cores': 4,
        'memory': '4g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    proteinfilename = proteinfile.split("/")[-1]
    
    spec = '''
    source /com/extra/cufflinks/2.2.1/load.sh
    source activate busco
    gffread -y {protfile} -g {ref} {ingff}
    
    cd busco
    run_BUSCO.py -i {protfilename} -o {name} -l ../../busco/sample_data/embryophyta_odb9/ -m proteins -c 4 > {report}
    '''.format(protfile=proteinfile, protfilename=proteinfilename, ref=reference, ingff=ingff, name=dbname, report=report)
    return inputs, outputs, options, spec
gwf.target_from_template("RepensBUSCO",
                         BUSCO("gff_files/repens.final.gtf",
                               "repens/TrR.v5.fasta",
                               "busco/repens.proteins.fa",
                               "RepensFinal",
                               "RepensFinalReport.out"))
gwf.target_from_template("OccidentaleBUSCO",
                         BUSCO("gff_files/occidentale.final.gtf",
                               "occidentale/To.fasta",
                               "busco/occidentale.proteins.fa",
                               "OccidentaleFinal",
                               "OccidentaleFinalReport.out"))
gwf.target_from_template("PallescensBUSCO",
                         BUSCO("gff_files/pallescens.final.gtf",
                               "pallescens/Tp.fasta",
                               "busco/pallescens.proteins.fa",
                               "PallescensFinal",
                               "PallescensFinalReport.out"))

Efficient sequence reader for a fasta file, using the byte offsets from the samtools .fai index. Used to separate the chromosomes into separate files to speed up the pipeline.

from sys import argv, exit, stdout
def read_index(filename):
    f = open(filename)
    indices = {}
    for line in f:
        chrom, _, byteindex, _, _ = line.split("\t")
        indices[chrom] = int(byteindex)
    return indices
def read_sequence(filename, index, target):
    f = open(filename)
    # The .fai offset points at the first sequence byte; step back past the
    # ">name\n" header, then scan forward to ">" and print the whole record.
    f.seek(index-len(target)-2)
    while True:
        c = f.read(1)
        if c==">":
            stdout.write(">")
            break
    stdout.write(f.readline())
    for line in f:
        if line[0]==">": break
        stdout.write(line)
if __name__=="__main__":
    if len(argv)==3:
        reference, target = argv[1:]
    else:
        print("USAGE: python extract_sequence.py <reference> <sequence_name>")
        exit()
    try:
        indices = read_index(reference+".fai")
    except IOError:
        print("No index file included or file does not exist")
        exit()
    if target in indices:
        read_sequence(reference, indices[target], target)
    else:
        print("Sequence name not found!")
        exit()

Loads all of the exons from the additional gff files into a hash (dictionary) for exact matches. For inexact matches it builds an exon map with hash lookup based on a bin size, and then compares the overlap of all the exons inside the bin. It computes a confidence from both exact and inexact matches, weighting exact matches more heavily (every exact match also counts as an inexact match), so confidence = (exact + overlapping) / (2 × number of exons).
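
A minimal sketch of the binning idea described above (a standalone illustration, not part of the pipeline code):

from math import floor

def bins_for(start, stop, bin_size=25000):
    # An exon is registered under the bin of its start and, if it crosses a
    # bin boundary, also under the bin of its stop; a lookup by bin then
    # finds the stored exons whose start or stop falls in that bin.
    sbin = int(floor(start / bin_size)) * bin_size
    stbin = int(floor(stop / bin_size)) * bin_size
    return sorted({sbin, stbin})

# bins_for(24900, 25100) -> [0, 25000]: the exon is indexed in both bins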

from ANNOTATIONmerger import *
from math import floor
def overlap(exon1, exon2):
    _, start1, stop1, dirc1 = exon1
    _, start2, stop2, dirc2 = exon2
    dirc = dirc1==dirc2
    left = stop2>start1 and start2<=start1
    right = start2<stop1 and stop2>=stop1
    in1 = start1<=start2 and stop1>=stop2
    in2 = start2<=start1 and stop2>=stop1
    if dirc and (left or right or in1 or in2):
        return 1
    else:
        return 0
def overlap_confidence(exon1, exon2):
    _, start1, stop1, dirc1 = exon1
    _, start2, stop2, dirc2 = exon2
    if dirc1!=dirc2: return 0
    area = min(stop1, stop2)-max(start1, start2)
    if area<0: area = 0
    return area
if __name__=="__main__":
    _current_version = "0.2.7"
    usage = '''
    usage: python \033[4m%prog\033[24m \033[38;5;74m[options]\033[39m \033[32m <main gff file> <gff files to filter by>\033[39m'''
    parser = OptionParser(usage)
    #parser.add_option('-R', type="string", nargs=1, dest="RNAdepth",
    #                  help="RNA read depth file. <Created by samtools depth -aa reads.bam>")
    #parser.add_option('-Q', type="int", nargs=1, dest="QueueSize", default=15,
    #                  help="Maximum Queue size. Ability to look back to previous transcripts. Larger Q increases runtime and space consumption. Also allows for better comparisons of iso-forms. Default is set to 15.")
    parser.add_option('-c', type="string", nargs=1, dest="Cutoff", default="0.5",
                      help="Confidence level cutoff. Default is set to 0.5.")
    parser.add_option('-b', type="string", nargs=1, dest="BinSize", default="25000",
                      help="bin size. Default is set to 25 kb (30000).")
    parser.add_option('-o', type="string", nargs=1, dest="Output", default="output.gtf",
                      help="Output file name. Default: output.gtf")
    pretty_print("======= ANNOTATION CLEANER (v{}) =======".format(_current_version), "orange")
    
    options, args = parser.parse_args()
    
    _cut_off = float(options.Cutoff)
    _outfile_name = options.Output
    _bin_size = int(options.BinSize)
    _queue_size = 1
    pretty_print("Cutoff level set to {} and Bin size set to {}".format(_cut_off, _bin_size), "red")
    pretty_print("Output will be saved into: {}".format(_outfile_name), "red")
    pretty_print("OPENING GFF/GTF FILES", "lightblue")
    gene_annotations = []
    filenames = []
    for arg in args:
        if ".gff" in arg or ".gtf" in arg:
            filenames.append(arg)
    main_gff = GFF(filenames[0])
    pretty_print("\tMain gff file: "+filenames[0])
            
    for filename in filenames[1:]:
        pretty_print("\tAdditional gff files: "+filename)
        gene_annotations.append(GFF(filename))
    pretty_print("READING EXONS FROM THE ADDITIONAL FILES", "lightblue")
    exon_hash = {}
    exon_map = {}
    display_counter = 1
    display_by = 25000
    for ga in gene_annotations:
        while not ga._empty:
            if (display_counter % display_by)==0:
                print("\t{} transcripts read through".format(display_counter))
            gene = ga.get_gene()
            for element_type in ('exon', 'cds'):
                if element_type not in gene.elements:
                    continue
                for exon in gene.elements[element_type]:
                    string_representation = "%r" % (exon)
                    exon_hash[string_representation] = 0
                    chrom, start, stop, dirc = exon
                    if chrom not in exon_map:
                        exon_map[chrom] = {}
                    if dirc not in exon_map[chrom]:
                        exon_map[chrom][dirc] = {}
                    # Register the exon under its start bin and, if it crosses
                    # a bin boundary, also under its stop bin.
                    sbin = int(floor(start/_bin_size))*_bin_size
                    stbin = int(floor(stop/_bin_size))*_bin_size
                    for b in {sbin, stbin}:
                        if b not in exon_map[chrom][dirc]:
                            exon_map[chrom][dirc][b] = []
                        exon_map[chrom][dirc][b].append(exon)
            display_counter += 1
    pretty_print("FILTERING MAIN GFF FILE", "lightblue")
    outgff = GFFwriter(_outfile_name, _current_version)
    total_gene_count = 0
    passed_genes = 0
    passed_genes_sub1 = 0
    passed_genes_sub2 = 0
    display_counter = 1
    display_by = 25000
    subgenome1 = set(['chr1', 'chr2', 'chr3', 'chr4',
                      'chr5', 'chr6', 'chr7', 'chr8'])
    subgenome2 = set(['chr9', 'chr10', 'chr11', 'chr12',
                      'chr13', 'chr14', 'chr15', 'chr16'])
    
    while not main_gff._empty:
        if (display_counter % display_by)==0:
            print("\t{} transcripts read through".format(display_counter))
        total_gene_count += 1
        gene = main_gff.get_gene()
        largest_type = max((len(gene.get('cds', [])), 'cds'),
                           (len(gene.get('exon', [])), 'exon'),
                           key=lambda x: x[0])[1]
        exon_count = 0
        exon_overlap_count = 0
        exon_overlap_score = 0
        for exon in gene[largest_type]:
            if "%r" % (exon) in exon_hash:
                exon_count += 1
            chrom, start, stop, dirc = exon
            if chrom in exon_map and dirc in exon_map.get(chrom, {}):
                sbin = int(floor(start/_bin_size))*_bin_size
                stbin = int(floor(stop/_bin_size))*_bin_size
                # Count each exon at most once, even when it spans two bins.
                found = False
                for b in {sbin, stbin}:
                    for test_exon in exon_map[chrom][dirc].get(b, []):
                        if overlap(exon, test_exon):
                            found = True
                            break
                    if found:
                        break
                if found:
                    exon_overlap_count += 1
        
        confidence = (exon_count+exon_overlap_count)/float(2*len(gene[largest_type]))
        
        if confidence>=_cut_off:
            passed_genes += 1
            if gene.chromosome in subgenome1:
                passed_genes_sub1 += 1
            if gene.chromosome in subgenome2:
                passed_genes_sub2 += 1
            outgff.write_gene(gene, confidence, filenames[0])
        display_counter += 1
    pretty_print("FILE STATISTICS", "lightblue")
    pretty_print("\tTotal gene number: {}".format(total_gene_count))
    pretty_print("\tPassed genes: {}".format(passed_genes))
    pretty_print("\tPassed genes in first subgenome: {}".format(passed_genes_sub1))
    pretty_print("\tPassed genes in second subgenome: {}".format(passed_genes_sub2))
    pretty_print("\tTotal: {}".format(passed_genes_sub1+passed_genes_sub2))

First attempt at merging the multiple gff files. Currently only used for the classes and functions inside ANNOTATIONcleaner.py.
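
The merger walks the input annotations in parallel and scores overlapping transcripts with a dynamic-programming alignment of their exon lists (exon_overlap below); as a worked example, two transcripts with 3 and 4 exons of which 2 pairs overlap score 2*2/(3+4) ≈ 0.57.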

#import sys
from optparse import OptionParser
#from matplotlib import pyplot
class Queue:
    def __init__(self, queue_size):
        self._list = list()
        self._max_size = queue_size
        self.full = False
    def add(self, value):
        if self.full:
            self._list.pop(0)
            self._list.append(value)
        else:
            self._list.append(value)
            if len(self._list)==self._max_size:
                self.full = True
    def get(self, element):
        if element>self._max_size: return None
        if len(self._list)<element: return None
        return self._list[-element]
class Gene:
    def __init__(self, information):
        self.elements = {}
        self.new_gene(information)
    def new_gene(self, information):
        chrom, _, element_type, start, stop, _, direction = information[:7]
        self.elements['transcript'] = [chrom, int(start), int(stop), direction]
        self.direction = direction
        self.start = int(start)
        self.stop = int(stop)
        self.chromosome = chrom
        if "_" in chrom:
            self.chrom_number = int(chrom.split("_")[-1])
        else:
            self.chrom_number = int(chrom.split("chr")[-1])
        self._lines = [information]
    def add_element(self, information):
        chrom, _, element_type, start, stop, _, direction = information[:7]
        element_type = element_type.lower()
        if self.direction==".":
            self.direction = direction
        #print self.__contains__([chrom, start, stop, direction])
        if [chrom, int(start), int(stop), direction] in self:
            self._lines.append(information)
            if element_type.lower() not in self.elements:
                self.elements[element_type.lower()] = list()
            self.elements[element_type.lower()].append([chrom, int(start), int(stop), direction])
    def __contains__(self, gene_info):
        if type(gene_info)==Gene:
            return gene_info.direction==self.direction and gene_info.chromosome==self.chromosome and (self.start>=gene_info.start and self.stop<=gene_info.stop)
        if type(gene_info)==list or type(gene_info)==tuple:
            if len(gene_info)==4:
                c, start, stop, d = gene_info
                return d==self.direction and c==self.chromosome and self.start<=start and self.stop>=stop
            else:
                print "WARNING: If list input must be [chromosome, start, stop, direction]"
                return None
    def overlaps(self, gene):
        chrom = gene.chromosome==self.chromosome
        dirc = gene.direction==self.direction
        left = gene.stop>self.start and gene.start<=self.start
        right = gene.start<self.stop and gene.stop>=self.stop
        return (chrom and dirc) and (left or right or (gene in self) or (self in gene))
    def __lt__(self, gene):
        if self.chromosome!=gene.chromosome:
            return self.chrom_number<gene.chrom_number
        return self.stop<gene.stop
    def __le__(self, gene):
        if self.chromosome!=gene.chromosome:
            return self.chrom_number<=gene.chrom_number
        return self.stop<=gene.stop
    def __gt__(self, gene):
        if self.chromosome!=gene.chromosome:
            return self.chrom_number>gene.chrom_number
        return self.stop>gene.stop
    def __ge__(self, gene):
        if self.chromosome!=gene.chromosome:
            return self.chrom_number>=gene.chrom_number
        return self.stop>=gene.stop
    def __eq__(self, gene):
        if self.chromosome!=gene.chromosome:
            return False
        return self.stop==gene.stop and self.start==gene.start
    def __ne__(self, gene):
        return not self.__eq__(gene)
    def __len__(self):
        return self.stop-self.start
    def __getitem__(self, item):
        return self.elements[item]
    def get(self, item, default=None):
        if item not in self.elements:
            return default
        return self.elements[item]
    def coding_region_length(self):
        cds_length = 0
        exon_length = 0
        if 'cds' in self.elements:
            for cds in self.elements['cds']:
                cds_length += cds[2]-cds[1]
        if 'exon' in self.elements:
            for exon in self.elements['exon']:
                exon_length += exon[2]-exon[1]
        return max(cds_length, exon_length)
    def __str__(self):
        s = "GENE TRANSCRIPT\n"
        s += "\tChromosome: {}, Start: {},".format(self.chromosome,
                                                   self.start)
        s += " Stop: {}, Direction: {}\n".format(self.stop,
                                                 self.direction)
        if 'cds' in self.elements:
            s += "\tNumber of cds: {}\n".format(len(self.elements['cds']))
        if 'exon' in self.elements:
            s += "\tNumber of exons: {}\n".format(len(self.elements['exon']))
        return s
        
class GFF:
    def __init__(self, filename, queue_size=1):
        if ".gff" in filename: self._filetype = "GFF"
        if ".gtf" in filename: self._filetype = "GTF"
        self._filename = filename
        self._file = open(filename)
        self._Queue = Queue(queue_size)
        self._empty = False
    def get_gene(self):
        if self._empty: return None
        in_gene = False
        while True:
            self._last_line = self._file.tell()
            line = self._file.readline()
            if line=='':
                self._empty = True
                break
            if line[0]=="#": continue
            elements = line.split("\t")
            chrom, _, element_type, start, stop = elements[:5] 
            if in_gene:
                if element_type=="gene" or element_type=="transcript":
                    self._file.seek(self._last_line)
                    break
                gene.add_element(elements)
            else:
                if element_type=="gene" or element_type=="transcript":
                    in_gene = True
                    gene = Gene(elements)
        self._Queue.add(gene)
        return gene
    def get_last_gene(self):
        return self._Queue.get(1)
    def __str__(self):
        return "<--{} file with filename {}-->".format(self._filetype, self._filename)
def test_overlaps(genes):
    overlaps = []
    for i, gene_1 in enumerate(genes):
        for j, gene_2 in enumerate(genes[i:]):
            if i==(j+i): continue
            overlaps.append(gene_1.overlaps(gene_2))
    return overlaps
def get_furthest_gene(genes, empty):
    smaller_than_counts = []
    for i, gene_1 in enumerate(genes):
        counter = 0
        for j, gene_2 in enumerate(genes):
            if i==j: continue
            if empty[i]:
                counter -= 1
                continue
            if gene_1<gene_2: counter += 1
        smaller_than_counts.append(counter)
    return max(enumerate(smaller_than_counts), key=lambda x: x[1])[0]
def exon_overlap(exon1list, exon2list):
    if exon1list==[] or exon2list==[]:
        return 0
    
    n, m = len(exon1list), len(exon2list)
    
    def overlap(exon1, exon2):
        _, start1, stop1, dirc1 = exon1
        _, start2, stop2, dirc2 = exon2
        dirc = dirc1==dirc2
        left = stop2>start1 and start2<=start1
        right = start2<stop1 and stop2>=stop1
        in1 = start1<=start2 and stop1>=stop2
        in2 = start2<=start1 and stop2>=stop1
        if dirc and (left or right or in1 or in2):
            return 1
        else:
            return 0
    O = list() # overlap matrix
    for i in range(n+1):
        O.append(list())
        for j in range(m+1):
            if i==0 or j==0:
                O[i].append(0)
                continue
            ol = overlap(exon1list[i-1], exon2list[j-1])
            O[i].append(max(O[i-1][j], O[i][j-1], O[i-1][j-1]+ol))
    return (O[-1][-1]*2)/float(n+m)
def exon_confidence(exon1list, exon2list):
    if exon1list==[] or exon2list==[]:
        return 0
    cds1 = sum([exon[2]-exon[1] for exon in exon1list])
    cds2 = sum([exon[2]-exon[1] for exon in exon2list])
    
    n, m = len(exon1list), len(exon2list)
    
    def overlap(exon1, exon2):
        _, start1, stop1, dirc1 = exon1
        _, start2, stop2, dirc2 = exon2
        if dirc1!=dirc2: return 0
        area = min(stop1, stop2)-max(start1, start2)
        if area<0: area = 0
        return area
    O = list() # overlap matrix
    for i in range(n+1):
        O.append(list())
        for j in range(m+1):
            if i==0 or j==0:
                O[i].append(0)
                continue
            ol = overlap(exon1list[i-1], exon2list[j-1])
            O[i].append(max(O[i-1][j], O[i][j-1], O[i-1][j-1]+ol))
            
    return (O[-1][-1]*2)/float(cds1+cds2)
    
def exon_overlap_score(genes):
    overlaps_scores = []
    for i, gene_1 in enumerate(genes):
        for j, gene_2 in enumerate(genes[i:]):
            if i==(j+i): continue
            largest_type_1 = max((len(gene_1.get('cds', [])), 'cds'),
                                 (len(gene_1.get('exon', [])), 'exon'),
                                 key=lambda x: x[0])[1]
            largest_type_2 = max((len(gene_2.get('cds', [])), 'cds'),
                                 (len(gene_2.get('exon', [])), 'exon'),
                                 key=lambda x: x[0])[1]
            score = exon_overlap(gene_1.get(largest_type_1, []),
                                 gene_2.get(largest_type_2, []))
            confidence = exon_confidence(gene_1.get(largest_type_1, []),
                                         gene_2.get(largest_type_2, []))
            overlaps_scores.append((score+confidence)/2)
            #overlaps_scores.append(confidence)
            
    return(sum(overlaps_scores)/len(genes), overlaps_scores)
def which_best_overlap(overlaps):
    # NOTE: relies on the global 'filenames' list defined in __main__.
    scores = [0 for _ in range(len(overlaps))]
    c = 0
    for i in range(len(filenames)):
        for j in range(i, len(filenames)):
            if i==j: continue
            scores[i] += overlaps[c]
            scores[j] += overlaps[c]
            c += 1
    return max(enumerate(scores), key=lambda x: x[1])
class GFFwriter:
    def __init__(self, outfile, version="0.0"):
        self._file = open(outfile, "w")
        self._current_version = version
        self._write_header()
        self._transcript_counter = 1
        self.IDs = {}
        
    def write_gene(self, gene, confidence_level=0, origin=""):
        lines = gene._lines
        #if "ID" in lines[0][-1]:
        #    ID = {key:value for key, value in [l.split("=") for l in lines[0][-1].split(";")]}['ID']
        #else:
        #    ID = lines[0][-1]
        #if ID not in self.IDs:
            #self.IDs[ID] = 0
            
        self._file.write("## Transcript number {}\n".format(self._transcipt_counter))
        self._file.write("## Confidence level for transcript: {}\n".format(confidence_level))
        self._file.write("## transcript origin: {}\n".format(origin))
        self._transcipt_counter += 1
        for line in lines:
            pasteline = "\t".join(line)
            pasteline = pasteline.replace("\n", "")
            pasteline = pasteline.replace("\r", "")
            pasteline = pasteline+"; confidence_level={}\n".format(confidence_level)
            self._file.write(pasteline)
    def _write_header(self):
        self._file.write("## ANNOTATIONcleaner (v{})\n".format(self._current_version))
def Scan_Queues():
    pass
class CandidateBlock:
    pass
def pretty_print(message, colour="reset"):
    base_color = "\033[39m"
    my_color = {"reset": "\033[39m", "green": "\033[32m",
                "cyan": "\033[96m", "blue": "\033[34m",
                "red": "\033[31m", "lightblue": "\033[38;5;74m",
                "orange": "\033[38;5;202m"}
    print(my_color.get(colour, "\033[39m")+message+base_color)
    
if __name__=="__main__":
    _current_version = "0.4.4"
    usage = '''
    usage: python \033[4m%prog\033[24m \033[38;5;74m[options]\033[39m \033[32m<gff files or gtf files>\033[39m'''
    parser = OptionParser(usage)
    parser.add_option('-R', type="string", nargs=1, dest="RNAdepth",
                      help="RNA read depth file. <Created by samtools depth -aa reads.bam>")
    parser.add_option('-Q', type="int", nargs=1, dest="QueueSize", default=15,
                      help="Maximum Queue size. Ability to look back to previous transcripts. Larger Q increases runtime and space consumption. Also allows for better comparisons of iso-forms. Default is set to 15.")
    parser.add_option('-c', type="string", nargs=1, dest="Cutoff", default="0.5",
                      help="Confidence level cutoff. Default is set to 0.5.")
    parser.add_option('-s', type="string", nargs=1, dest="SizeLimit", default="30000",
                      help="Gene size limit. Default is set to 30 kb (30000).")
    parser.add_option('-o', type="string", nargs=1, dest="Output", default="output.gtf",
                      help="Output file name. Default: output.gtf")
    pretty_print("======= ANNOTATION MERGER (v{}) =======".format(_current_version), "orange")
    
    options, args = parser.parse_args()
    
    _queue_size = options.QueueSize
    _cut_off = float(options.Cutoff)
    _gene_size_limit = float(options.SizeLimit)
    _outfile_name = options.Output
    pretty_print("Queue size set to {} and Cutoff level set to {}".format(_queue_size, _cut_off), "red")
    pretty_print("Output will be saved into: {}".format(_outfile_name), "red")
    
    pretty_print("OPENING GFF/GTF FILES", "lightblue")
    gene_annotations = []
    filenames = []
    for arg in args:
        if ".gff" in arg or ".gtf" in arg:
            filenames.append(arg)
    
    for filename in filenames:
        pretty_print("\t"+filename)
        gene_annotations.append(GFF(filename))
    pretty_print("PROCESSING GFF/GTF FILES", "lightblue")
    empty = [False for _ in gene_annotations]
    current_genes = [ga.get_gene() for ga in gene_annotations]
    #print(test_overlaps(current_genes))
    transcript_counts = [1 for _ in range(len(gene_annotations))]
    all_overlap_count = 0
    any_overlap_count = 0
    exon_overlaps = []
    passed_genes = 0
    outgff = GFFwriter(_outfile_name)
    display_counter = 1
    display_by = 25000
    pairwise_map = {}
    c = 0
    for i in range(len(filenames)):
        for j in range(i, len(filenames)):
            if i==j: continue
            pairwise_map[c] = (i, j)
            c += 1
            
    ## MAIN SCRIPT
    while (not all(empty)):
        if (display_counter % display_by)==0:
            print("\t{} transcripts read through".format(display_counter))
        overlaps = test_overlaps(current_genes)
        if all(overlaps):
            all_overlap_count += 1
        if any(overlaps):
            any_overlap_count += 1
            score, pair_scores = exon_overlap_score(current_genes)
            if any(score>=_cut_off for score in pair_scores):
                if all(overlaps):
                    bestgene = 0
                    outgff.write_gene(current_genes[bestgene], score, filenames[bestgene])
                else:
                    #which_max, score = max(enumerate(pair_scores), key=lambda x: x[1])
                    #pairs = pairwise_map[which_max]
                    #bestgene = min(pairs)
                    bestgene = which_best_overlap(pair_scores)[0]
                    #bestgene = max(pairs, key=lambda x: len(current_genes[x]))
                    #print pairs
                    #print len(current_genes[pairs[0]]), len(current_genes[pairs[1]])
                    #print bestgene
                    outgff.write_gene(current_genes[bestgene], score, filenames[bestgene])
                    passed_genes += 1
                #if not gene_annotations[bestgene]._empty:
                #    current_genes[bestgene] = gene_annotations[bestgene].get_gene()
                #    while len(current_genes[bestgene])>_gene_size_limit:
                #        current_genes[bestgene] = gene_annotations[bestgene].get_gene()
                #    empty[bestgene] = gene_annotations[bestgene]._empty
                #    transcript_counts[bestgene] += 1
                #    display_counter += 1
                #continue
        lagging_gene = get_furthest_gene(current_genes, empty)
        current_genes[lagging_gene] = gene_annotations[lagging_gene].get_gene()
        while len(current_genes[lagging_gene])>_gene_size_limit:
            current_genes[lagging_gene] = gene_annotations[lagging_gene].get_gene()
        empty[lagging_gene] = gene_annotations[lagging_gene]._empty
        transcript_counts[lagging_gene] += 1
        display_counter += 1
    pretty_print("FILE STATISTICS", "lightblue")
    
    pretty_print("\tAny genes overlap: {}".format(any_overlap_count))
    pretty_print("\tAll genes overlap: {}".format(all_overlap_count))
    pretty_print("\tAverage exon overlap: {}".format(sum(exon_overlaps)/all_overlap_count))
    pretty_print("\tPassed genes: {}".format(passed_genes))
    for filename, tc in zip(filenames, transcript_counts):
         pretty_print("\t{} gene count: {}".format(filename, tc))
    percentage = float(all_overlap_count*len(filenames))/float(sum(transcript_counts))
    pretty_print("\tPercentage all overlap: {} %".format(percentage*100))
    #for ga in gene_annotations:
    #    print(ga._Queue.get(1))
    #for ga in gene_annotations:
    #    print(ga._empty)
    

Takes blast results against some database in the format -outfmt '7 qseqid evalue bitscore score stitle' and finds the matching queries. It picks the top blast result as the best guess.
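
A hedged usage sketch (database and file names are placeholders; whether blastp or blastn applies depends on the sequences):

blastp -query proteins.fa -db somedb -outfmt '7 qseqid evalue bitscore score stitle' > proteins.blast.out
python BLASTannotater.py -q proteins.fa -o annotated.fasta -s summary.csv proteins.blast.out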

from optparse import OptionParser
import re
class FASTA:
    def __init__(self, filename):
        self._file = open(filename)
        self._empty = False
    def read_sequence(self):
        if self._empty: return None
        inseq = False
        header = ''
        sequence = ''
        while True:
            last_line = self._file.tell()
            line = self._file.readline()
            if line=='':
                self._empty = True
                break
            if line[0]==";": continue
            if line[0]=='>' and inseq:
                self._file.seek(last_line)
                break
            if line[0]=='>' and not inseq:
                header = line[1:-1]
                inseq = True
            if line[0]!='>' and inseq: sequence += line[:-1]
        return(header, sequence)
    def empty(self):
        return self._empty
class BLAST:
    def __init__(self, filename):
        self._file = open(filename)
        self._empty = False
        self._Fields = None
    def read_query(self):
        if self._empty: return None
        in_query = False
        query_name = None
        Matches = []
        while True:
            last_line = self._file.tell()
            line = self._file.readline()
            if line=='':
                self._empty = True
                break
            if line[0]=="#":
                if 'Fields' in line and self._Fields is None:
                    line = line[:-1].split(": ")[-1]
                    self._Fields = line.split(", ")
                continue
            elements = line[:-1].split("\t")
            # Convert numeric fields to int or float where possible.
            for i in range(len(elements)):
                element = elements[i]
                try:
                    elements[i] = int(element)
                except ValueError:
                    try:
                        elements[i] = float(element)
                    except ValueError:
                        pass
            if in_query:
                if query_name != elements[0]:
                    self._file.seek(last_line)
                    break
                Matches.append({field:element for field, element in zip(self._Fields, elements)})
            else:
                query_name = elements[0]
                Matches.append({field:element for field, element in zip(self._Fields, elements)})
                in_query = True
        return query_name, Matches
    def empty(self):
        return self._empty
    
def pretty_print(message, colour="reset"):
    base_color = "\033[39m"
    my_color = {"reset": "\033[39m", "green": "\033[32m",
                "cyan": "\033[96m", "blue": "\033[34m",
                "red": "\033[31m", "lightblue": "\033[38;5;74m",
                "orange": "\033[38;5;202m"}
    print(my_color.get(colour, "\033[39m")+message+base_color)
    
if __name__=="__main__":
    _current_version = "0.1.5"
    usage = '''
    usage: python \033[4m%prog\033[24m \033[38;5;74m[options]\033[39m \033[32m<blast result files>\033[39m'''
    parser = OptionParser(usage)
    parser.add_option('-q', type="string", nargs=1, dest="Queries", default=None,
                      help="Input query filename, the sequence used for the blast search. (FASTA format)")
    parser.add_option('-o', type="string", nargs=1, dest="Output", default="output.fasta",
                      help="Output file name. Default: output.fasta")
    parser.add_option('-s', type="string", nargs=1, dest="Summary", default="summary.csv",
                      help="Output summary csv file. Keeps track of the functions given to all transcripts. Default: summary.csv")
    parser.add_option('-e', type="float", nargs=1, dest="Evalue", default=1,
                      help="Evalue cutoff. Default 1")
    pretty_print("======= BLAST ANNOTATER (v{}) =======".format(_current_version), "orange")
    
    options, args = parser.parse_args()
    if options.Queries is None:
        raise "No query file given, please use: -q filename.fasta"
    query = options.Queries
    blast_filename = args[0]
    outfile = options.Output
    summary_file = options.Summary
    eval_cutoff = options.Evalue
    pretty_print("OPENING FILES", "lightblue")
    
    query_file = FASTA(query)
    blast_file = BLAST(blast_filename)
    # araprot regex (\|.+?\|)
    stringmatcher = re.compile(r".+?\|.+?\|")
    hexnumbers = re.compile("%..")
    #query1, matches1 = blast.read_query()
    #query2, matches2 = blast.read_query()
    #matches1 = filter(lambda x: x['evalue']<eval_cutoff, matches1)
    #matches2 = filter(lambda x: x['evalue']<eval_cutoff, matches2)
    
    #print len(matches1)
    #print len(matches2)
    #for match in matches1:
    #    print match['subject title']
    #    print stringmatcher.findall(match['subject title'])[0]
    pretty_print("READING BLAST FILE", "lightblue")
    blast_queries = {}
    while not blast_file.empty():
        blast_query, matches = blast_file.read_query()
        blast_queries[blast_query] = matches[0]
    
    pretty_print("READING AND WRITING FASTA FILE", "lightblue")
    
    f = open(outfile, "w")
    s = open(summary_file, "w")
    s.write("transcript;function\n")
    while not query_file.empty():
        gene_name, sequence = query_file.read_sequence()
        gene_name = gene_name.split(" ")[0]
        if gene_name in blast_queries:
            function = stringmatcher.findall(blast_queries[gene_name]['subject title'])[0][:-1]
            if "%" in function:
                hex_values = hexnumbers.findall(function)
                for hex_value in hex_values:
                    function = function.replace(hex_value, hex_value[1:].decode("hex"))
        else:
            function = "NO MATCH"
        s.write(gene_name+";"+function+"\n")
        f.write(">"+gene_name+" | "+function+"\n")
        for i in range((len(sequence)//60)+1):
            f.write(sequence[(i*60):((i+1)*60)])
            f.write("\n")
            
    f.close()
    s.close()
    pretty_print("FINISHED FILES", "lightblue")
    #for match in matches2:
    #    print match
    #print [open(blast_filename).read()]

Using the summary csv produced by BLASTannotater.py, this script adds the function, and the query it matched, to the attributes column of a gtf file.
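
A hedged usage sketch, assuming the script below is saved as GFFaddinfo.py (the source does not give its file name; the other file names are placeholders):

python GFFaddinfo.py -s summary.csv -o repens.annotated.gtf repens.final.gtf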

from optparse import OptionParser
def read_summary(summary_file):
    f = open(summary_file)
    summary_data = {}
    f.readline()
    for line in f:
        name, tag = line.split(";", 1)
        summary_data[name] = {}
        tag = tag.replace("\n", "")
        if tag=="NO MATCH":
            summary_data[name]["transcript"] = tag
            summary_data[name]["function"] = "UNKNOWN"
            continue
        transcript, function = tag.split(" | ", 1)
        summary_data[name]["transcript"] = transcript
        summary_data[name]["function"] = function
    return summary_data
def annotate_gff(gff_file, outfile, summary_data):
    gff = open(gff_file)
    gff.readline()
    out = open(outfile, "w")
    for line in gff:
        if line[0]=="#":
            out.write(line)
            continue
        line = line.replace("\n", "")
        line_info = line.split("\t")
        element_type, INFO = line_info[2], line_info[-1]
        if element_type=="transcript":
            INFO = INFO.split(";")
            transcript = INFO[0]
            function = summary_data[transcript]['function']
            matching_query = summary_data[transcript]['transcript']
            compiled_info = {}
            order = []
            for entry in INFO[1:]:
                #print entry
                #print "=" in entry
                if entry == "": continue
                if "=" in entry:
                    key, val = entry.split("=")
                    key.replace(" ", "")
                    compiled_info[key] = val
                    order.append(key)
                else:
                    key, val = entry.split('"', 1)
                    key.replace(" ", "")
                    val = val.replace('"', "")
                    compiled_info[key] = val
                    order.append(key)
            attribute = transcript+"; "
            for key in order:
                attribute += key+' "'+compiled_info[key]+'"; '
            attribute += 'function "'+function+'"; '
            attribute += 'tair_accession "'+matching_query+'"\n'
            
            out.write("\t".join(line_info[:-1])+"\t"+attribute)
        else:
            INFO = INFO.split(";")
            compiled_info = {}
            order = []
            for entry in INFO:
                #print entry
                if entry == "": continue
                if "=" in entry:
                    key, val = entry.split("=")
                    key.replace(" ", "")
                    compiled_info[key] = val
                    order.append(key)
                else:
                    key, val = entry.split('"', 1)
                    key = key.replace(" ", "")
                    val = val.replace('"', "")
                    compiled_info[key] = val
                    order.append(key)
            transcript = compiled_info["transcript_id"]
            function = summary_data[transcript]['function'][:-1]
            matching_query = summary_data[transcript]['transcript']
            attribute = ""
            for key in order:
                attribute += key+' "'+compiled_info[key]+'"; '
            attribute += 'function "'+function+'"; '
            attribute += 'tair_accession "'+matching_query+'"\n'
            
            out.write("\t".join(line_info[:-1])+"\t"+attribute)
            
def pretty_print(message, colour="reset"):
    base_color = "\033[39m"
    my_color = {"reset": "\033[39m", "green": "\033[32m",
                "cyan": "\033[96m", "blue": "\033[34m",
                "red": "\033[31m", "lightblue": "\033[38;5;74m",
                "orange": "\033[38;5;202m"}
    print(my_color.get(colour, "\033[39m")+message+base_color)
if __name__=="__main__":
    _current_version = "0.1"
    usage = '''
    usage: python \033[4m%prog\033[24m \033[38;5;74m[options]\033[39m \033[32m<gff file>\033[39m'''
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="Output", default="output.gff", help="Output file name. default: output.gff")
    parser.add_option('-s', type="string", nargs=1, dest="CSV", help="Summary csv file, containing annotations, produced by BLASTannotater.py (Required)")
    pretty_print("======= GFF add info (v{}) =======".format(_current_version), "orange")
    options, args = parser.parse_args()
    if len(args)>0:
        gff_file = args[0]
    else:
        raise ValueError("Missing argument: gff file")
    output = options.Output
    if options.CSV is None:
        raise ValueError("Missing summary csv file, please provide it using -s <filename.csv>")
    else:
        summary_file = options.CSV
    pretty_print("READING SUMMARY FILE", "cyan")
    summary_data = read_summary(summary_file)
    pretty_print("ANNOTATING GFF FILE", "cyan")
    annotate_gff(gff_file, output, summary_data)
    
    

HyLiTE pipeline

HyLiTE pipeline for running on all of the biological and replicate samples from the RNAseq data. The gwf pipeline for the HyLiTE workflow is reusable if the sample dictionaries at the top are replaced with the new samples.
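
The sample dictionaries map species to replicate labels to paired read files. A minimal sketch of the expected shape for new samples (paths and labels are placeholders):

raw_RNA = {"To": {"YL1": ["/path/to/To_1_YL_1.fq.gz",
                          "/path/to/To_1_YL_2.fq.gz"]}}
tissues = {"To": ["YL1"]}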

from gwf import Workflow
gwf = Workflow()
## Reference is a fasta file of the genes.
def bowtie2_index(reference, reference_name):
    inputs = [reference]
    outputs = [reference_name+".1.bt2"]
    options = {
        'cores': 1,
        'memory': '16g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    spec = '''
    source activate HyLiTE
    bowtie2-build {ref} {ref_n}
    '''.format(ref=reference, ref_n=reference_name)
    return inputs, outputs, options, spec
def star_index(reference, output):
    inputs = [reference]
    outputs = [output]
    options = {"cores":8, "memory":"64g", "account":"NChain", "walltime": "12:00:00"}
    directory = "/".join(output.split("/")[:-1])
    spec = """
    source /com/extra/STAR/2.5.2b/load.sh
    STAR --runMode genomeGenerate --runThreadN 8 --genomeDir {dir} --genomeFastaFiles {ref} --limitGenomeGenerateRAM=64000000000 --genomeSAindexNbases 3
    """.format(ref=reference, dir=directory)
    return inputs, outputs, options, spec
reference_file = "./references/To/To.v5.gDNA.fasta"
index_ref_file = "./references/To/To.ref"
raw_gDNA = {"To": ["/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_occidentale_180bp_1.fastq.gz",
                   "/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_occidentale_180bp_2.fastq.gz"],
            "Tp": ["/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_pallescens_180_1.fastq.gz",
                   "/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_pallescens_180_2.fastq.gz"],
            "TrR": ["/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_repens_180bp_1.fastq.gz",
                    "/home/marnit/NChain/faststorage/20181120_clover_180bp_gDNA/Trifolium_repens_180bp_2.fastq.gz"]}
raw_RNA_old = {"To": {"floral": ["../BRAKER_ANNOTATION/pipeline/RNAdata/occidentale/To_F2_pooled_1.fastq",
                             "../BRAKER_ANNOTATION/pipeline/RNAdata/occidentale/To_F2_pooled_2.fastq"],
                  "leaf": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_1_1.fastq",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_1_2.fastq"],
                  "stolon": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_3_1.fastq",
                             "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_3_2.fastq"],
                  "root": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_6_1.fastq",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/To/To_6_2.fastq"]},
           "Tp": {"floral": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_F1_pooled_1.fastq",
                             "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_F1_pooled_2.fastq"],
                  "leaf": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_L4_1_1.fastq",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_L4_1_2.fastq"],
                  "stolon": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/T_pal_stolon_1_1_9_1.fastq",
                             "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/T_pal_stolon_1_1_9_2.fastq"],
                  "root": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_root_3e1_47_1.fastq",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/old_rnaseq/Tp/Tp_root_3e1_47_2.fastq"]},
           "TrR": {"floral": ["../BRAKER_ANNOTATION/pipeline/RNAdata/repens/floral/TRfloral_1.fastq",
                              "../BRAKER_ANNOTATION/pipeline/RNAdata/repens/floral/TRfloral_2.fastq"],
                   "leaf": ["../BRAKER_ANNOTATION/pipeline/RNAdata/repens/leaf/TRleaf_1.fastq",
                            "../BRAKER_ANNOTATION/pipeline/RNAdata/repens/leaf/TRleaf_2.fastq"],
                   "stolon": ["../BRAKER_ANNOTATION/pipeline/RNAdata/repens/stolon/TRstolon_1.fastq",
                              "../BRAKER_ANNOTATION/pipeline/RNAdata/repens/stolon/TRstolon_2.fastq"],
                   "root": ["../BRAKER_ANNOTATION/pipeline/RNAdata/repens/root/TRroot_1.fastq",
                            "../BRAKER_ANNOTATION/pipeline/RNAdata/repens/root/TRroot_2.fastq"]}}
raw_RNA = {"To": {"YL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_YL_2.fq.gz"],
                  "YL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_YL_2.fq.gz"],
                  "YL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_YL_2.fq.gz"],
                  "OL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_OL_2.fq.gz"],
                  "OL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_OL_2.fq.gz"],
                  "OL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_OL_2.fq.gz"],
                  "St1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_St_2.fq.gz"],
                  "St2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_St_2.fq.gz"],
                  "St3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_St_2.fq.gz"],
                  "R1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_1_R_2.fq.gz"],
                  "R2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_2_R_2.fq.gz"],
                  "R3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/To_3_R_2.fq.gz"]},
           "Tp": {"YL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_YL_2.fq.gz"],
                  "YL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_YL_2.fq.gz"],
                  "YL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_YL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_YL_2.fq.gz"],
                  "OL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_OL_2.fq.gz"],
                  "OL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_OL_2.fq.gz"],
                  "OL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_OL_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_OL_2.fq.gz"],
                  "St1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_St_2.fq.gz"],
                  "St2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_St_2.fq.gz"],
                  "St3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_St_2.fq.gz"],
                  "R1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_1_R_2.fq.gz"],
                  "R2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_2_R_2.fq.gz"],
                  "R3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/Tp_3_R_2.fq.gz"]},
           "S9": {"F1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_F_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_F_2.fq.gz"],
                  "F2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_F_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_F_2.fq.gz"],
                  "F3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_F_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_F_2.fq.gz"],
                  "L1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_L_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_L_2.fq.gz"],
                  "L2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_L_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_L_2.fq.gz"],
                  "L3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_L_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_L_2.fq.gz"],
                  "St1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_St_2.fq.gz"],
                  "St2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_St_2.fq.gz"],
                  "St3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_St_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_St_2.fq.gz"],
                  "R1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_108_R_2.fq.gz"],
                  "R2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_60_R_2.fq.gz"],
                  "R3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_R_1.fq.gz",
                         "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S9_86_R_2.fq.gz"]},
           "S10": {"St1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_St_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_St_2.fq.gz"],
                   "St2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_St_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_St_2.fq.gz"],
                   "St3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_St_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_St_2.fq.gz"],
                   "YL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_YL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_YL_2.fq.gz"],
                   "YL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_YL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_YL_2.fq.gz"],
                   "YL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_YL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_YL_2.fq.gz"],
                   "OL1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_OL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_OL_2.fq.gz"],
                   "OL2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_OL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_OL_2.fq.gz"],
                   "OL3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_OL_1.fq.gz",
                           "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_OL_2.fq.gz"],
                   "R1": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_R_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_1_R_2.fq.gz"],
                   "R2": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_R_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_2_R_2.fq.gz"],
                   "R3": ["/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_R_1.fq.gz",
                          "/home/marnit/NChain/faststorage/WHITE_CLOVER/20181211_RNASEQ/new_rnaseq/raw_data/S10_3_R_2.fq.gz"]}}
all_species_gDNA = ["To", "Tp", "TrR"]
all_species_RNA = ["To", "Tp", "S9", "S10"]
tissues_old = ["floral", "leaf", "stolon", "root"]
tissues = {"To": ["St1", "St2", "St3", "R1", "R2", "R3", "YL1", "YL2", "YL3", "OL1", "OL2", "OL3"],
           "Tp": ["St1", "St2", "St3", "R1", "R2", "R3", "YL1", "YL2", "YL3", "OL1", "OL2", "OL3"],
           "S9": ["F1", "F2", "F3", "St1", "St2", "St3", "R1", "R2", "R3", "L1", "L2", "L3"],
           "S10": ["St1", "St2", "St3", "R1", "R2", "R3", "YL1", "YL2", "YL3", "OL1", "OL2", "OL3"]}
gwf.target_from_template("ToIndex",
                         bowtie2_index(reference_file, index_ref_file))
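# Two read-mapping templates follow: STAR and bowtie2. Only bowtie2_mapping is
# invoked by the mapping targets below; star_mapping is defined but unused here.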
def star_mapping(read1, output, output_prefix, genomeDir):
    inputs = [read1, genomeDir+"/genomeParameters.txt"]
    outputs = [output]
    options = {
    "cores": 8,
    "memory": "64g",
    "account": "NChain",
    "walltime": "12:00:00"}
    spec = """
    source /com/extra/STAR/2.5.2b/load.sh
    STAR --runThreadN 8 --genomeDir {dir} --outSAMtype BAM SortedByCoordinate --outFileNamePrefix {of} --readFilesIn {r1} --limitGenomeGenerateRAM=64000000000""".format(r1 = read1, of=output_prefix, dir=genomeDir)
    if read1.split(".")[-1]=="gz":
        spec += " --readFilesCommand zcat"
    return inputs, outputs, options, spec
def bowtie2_mapping(read1, output, genomeDir):
    inputs = [read1, genomeDir+".1.bt2"]
    outputs = [output]
    options = {
    "cores": 8,
    "memory": "24g",
    "account": "NChain",
    "walltime": "12:00:00"}
    spec = """
    source activate HyLiTE
    bowtie2 -N 1 -x {dir} -U {reads} -S {sam} --local -p {cores}
    """.format(reads = read1, sam=output, dir=genomeDir, cores=options["cores"])
    return inputs, outputs, options, spec
for species in all_species_gDNA:
    for i, readfile in enumerate(raw_gDNA[species]):
        gwf.target_from_template(species+"gDNAmap"+str(i+1),
                                 bowtie2_mapping(readfile, "./gDNA/{s}/{s}.{i}.gDNA.sam".format(s=species, i=i+1),
                                                 index_ref_file))
for species in all_species_RNA:
    for tissue in tissues[species]:
        for i, readfile in enumerate(raw_RNA[species][tissue]):
            gwf.target_from_template(species+"_"+tissue+"_"+"RNAmap"+str(i+1),
                                     bowtie2_mapping(readfile, "./RNA/{s}/{s}.{t}.{i}.gDNA.sam".format(s=species, t=tissue, i=i+1),
                                                     index_ref_file))
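# NB: the RNA mapping outputs reuse the ".gDNA.sam" suffix; the merge and sort
# templates below expect this same naming.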
def samtools_merge(infile1, infile2, outfile):
    inputs = [infile1, infile2]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    spec = """
    source /com/extra/samtools/1.3/load.sh
    samtools merge {} {} {}
    """.format(outfile, infile1, infile2)
    return inputs, outputs, options, spec
for species in all_species_gDNA:
    gwf.target_from_template(species+"gDNAmerge",
                             samtools_merge("./gDNA/{s}/{s}.{i}.gDNA.sam".format(s=species, i=1),
                                            "./gDNA/{s}/{s}.{i}.gDNA.sam".format(s=species, i=2),
                                            "./gDNA/{s}/{s}.gDNA.sam".format(s=species)))
for species in all_species_RNA:
    for tissue in tissues[species]:
        gwf.target_from_template(species+"_"+tissue+"_"+"RNAmerge",
                                 samtools_merge("./RNA/{s}/{s}.{t}.{i}.gDNA.sam".format(s=species, t=tissue, i=1),
                                                "./RNA/{s}/{s}.{t}.{i}.gDNA.sam".format(s=species, t=tissue, i=2),
                                                "./RNA/{s}/{s}.{t}.gDNA.sam".format(s=species, t=tissue)))
def samtools_process(infile, outfile):
    inputs = [infile]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    spec = """
    source /com/extra/samtools/1.3/load.sh
    #samtools view -Sb {infile} -o {infile}.bam
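    # samtools >= 1.3 sort auto-detects SAM input and infers BAM output from the
    # -o file extension, so the explicit view conversion above is left commented out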
    samtools sort {infile} -o {outfile}
    samtools index {outfile}
    """.format(infile=infile, outfile=outfile)
    return inputs, outputs, options, spec
for species in all_species_gDNA:
    gwf.target_from_template(species+"gDNAsort",
                             samtools_process("./gDNA/{s}/{s}.gDNA.sam".format(s=species),
                                            "./gDNA/{s}.gDNA.s.bam".format(s=species)))
for species in all_species_RNA:
    for tissue in tissues[species]:
        gwf.target_from_template(species+"_"+tissue+"_"+"RNAsort",
                                 samtools_process("./RNA/{s}/{s}.{t}.gDNA.sam".format(s=species, t=tissue),
                                                "./RNA/{s}.{t}.RNA.s.bam".format(s=species, t=tissue)))
def make_protocol_file(in_files, individual, samples, ploidy, filetype, outfile):
    inputs = in_files
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = ''
    for inf, ind, sample, p, f in zip(in_files, individual, samples, ploidy, filetype):
        spec += 'echo "'+"\t".join([ind, p, sample, f, inf])+'" >> '+outfile+"\n"
    print(spec)
    return inputs, outputs, options, spec
Name = {"To": "P1", "Tp": "P2", "TrR": "Ch"}
Ploidy = {"To": "1", "Tp": "1", "TrR": "2"}
bamfiles = []
individual = []
ploidy_levels = []
seq_type = []
#### MAKE SAMPLES LIST
samples = []
translate = {"To": "To", "Tp": "Tp",
             "S9": "TrR", "S10": "TrR"}
samples_map = {"To": [["St1", "R1", "YL1", "OL1"],
                      ["St2", "R2", "YL2", "OL2"],
                      ["St3", "R3", "YL3", "OL3"]],
               "Tp": [["St1", "R1", "YL1", "OL1"],
                      ["St2", "R2", "YL2", "OL2"],
                      ["St3", "R3", "YL3", "OL3"]],
               "S10": [["St1", "R1", "YL1", "OL1"],
                      ["St2", "R2", "YL2", "OL2"],
                      ["St3", "R3", "YL3", "OL3"]],
               "S9": [["F1", "St1", "R1", "L1"],
                      ["F2", "St2", "R2", "L2"],
                      ["F3", "St3", "R3", "L3"]]}
#species_included = {"To": False, "Tp": False, "TrR": False}
for species in [["S9", "S10"], "To", "Tp"]:
    if type(species)==list:
        sample_n = 1
        for specie in species:
            #print(specie, samples_map[specie])
            for current_samples in samples_map[specie]:
                for tissue in current_samples:
                    #print(specie, tissue)
                    bamfiles.append("./RNA/{s}.{t}.RNA.s.bam".format(s=specie, t=tissue))
                    seq_type.append("RNAseq")
                    individual.append(Name[translate[specie]])
                    ploidy_levels.append(Ploidy[translate[specie]])
                    samples.append("sample"+str(sample_n))
                    sample_n += 1
        for specie in [species[0]]:
            bamfiles.append("./gDNA/{s}.gDNA.s.bam".format(s=translate[specie]))
            seq_type.append("gDNA")
            individual.append(Name[translate[specie]])
            ploidy_levels.append(Ploidy[translate[specie]])
            samples.append("sample"+str(sample_n))
            sample_n += 1
    else:
        sample_n = 1
        for current_samples in samples_map[species]:
            for tissue in current_samples:
                bamfiles.append("./RNA/{s}.{t}.RNA.s.bam".format(s=species, t=tissue))
                seq_type.append("RNAseq")
                individual.append(Name[translate[species]])
                ploidy_levels.append(Ploidy[translate[species]])
                samples.append("sample"+str(sample_n))
                sample_n += 1
        bamfiles.append("./gDNA/{s}.gDNA.s.bam".format(s=translate[species]))
        seq_type.append("gDNA")
        individual.append(Name[translate[species]])
        ploidy_levels.append(Ploidy[translate[species]])
        samples.append("sample"+str(sample_n))
#print(bamfiles)
#print(individual)
#print(ploidy_levels)
#print(seq_type)
gwf.target_from_template("HyLiTEprotocolfile",
                         make_protocol_file(bamfiles,
                                            individual,
                                            samples,
                                            ploidy_levels,
                                            seq_type,
                                            "repens_protocol.txt"))
def make_bam_file(in_files,outfile):
    inputs = in_files
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '1g',
        'account': 'NChain',
        'walltime': '01:00:00'
    }
    spec = ""
    for ind in in_files:
        spec += 'echo "'+ind+'" >> '+outfile+"\n"
    return inputs, outputs, options, spec
gwf.target_from_template("mpileup_bamfile",
                         make_bam_file(bamfiles, "bamfiles.txt"))
def mpileup(reference, bamfiles, outfile):
    inputs = [reference, bamfiles]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '24g',
        'account': 'NChain',
        'walltime': '48:00:00'
    }
    spec = '''
    source /com/extra/samtools/1.3/load.sh
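    # -B disables BAQ recomputation, -Q 0 keeps bases regardless of base quality,
    # and -d 1000000 raises the per-file depth limit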
    samtools mpileup -BQ 0 -d 1000000 -f {ref} -b {bamfiles} > {outfile}
    '''.format(ref=reference, bamfiles=bamfiles, outfile=outfile)
    return inputs, outputs, options, spec
gwf.target_from_template("mpileup",
                         mpileup(reference_file, "bamfiles.txt",
                                 "repens_combo.pileup"))
def fix_mpileup(pileup_file, outfile):
    inputs = [pileup_file]
    outputs = [outfile]
    options = {
        'cores': 1,
        'memory': '2g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    spec = '''
    python mpileupfix.py {pileup} > {out}
    '''.format(pileup=pileup_file, out=outfile)
    return inputs, outputs, options, spec
gwf.target_from_template("mpileupfix",
                         fix_mpileup("repens_combo.pileup", "repens_fixed.pileup"))
def run_HyLiTE(reference, protocol, pileup_file):
    inputs = [reference, protocol, pileup_file]
    outputs = ["HyLiTE_output/HyLiTE_output.expression.txt"]
    options = {
        'cores': 1,
        'memory': '124g',
        'account': 'NChain',
        'walltime': '64:00:00'
    }
    spec = '''
    source activate HyLiTE
    HyLiTE -v -f {proto} -p {pileup} -n HyLiTE_output -r {ref}
    '''.format(ref=reference, proto=protocol, pileup=pileup_file)
    return inputs, outputs, options, spec
def HacknHyLiTE(reference, protocol, pileup_file, segments):
    inputs = [reference, protocol, pileup_file]
    outputs = []
    options = {
        'cores': 1,
        'memory': '12g',
        'account': 'NChain',
        'walltime': '12:00:00'
    }
    for i in range(segments):
        outputs.append("./slicing_directories/slicing.subset{}.sh".format(i))
    spec = '''
    source activate HyLiTE
    HacknHyLiTE -n {seg} -o slicing_directories --name slicing -f {proto} -p {pileup} --options "-r {ref}"
    '''.format(seg=segments, ref=reference, proto=protocol, pileup=pileup_file)
    return inputs, outputs, options, spec
#gwf.target_from_template("HyLiTE",
#                         run_HyLiTE(reference_file, "repens_protocol.txt", "repens_fixed.pileup"))
n_segments = 40
gwf.target_from_template("HacknHyLiTE",
                         HacknHyLiTE(reference_file, "repens_protocol.txt", "repens_fixed.pileup",
                                     n_segments))
subsets = []
subsets_out = []
for i in range(n_segments):
    subsets.append("./slicing_directories/slicing.subset{}.sh".format(i))
    subsets_out.append("./slicing_directories/subset{}/slicing.subset{}.expression.txt".format(i, i))
def run_subset(subset, output):
    inputs = [subset]
    outputs = [output]
    options = {
        'cores': 1,
        'memory': '24g',
        'account': 'NChain',
        'walltime': '48:00:00'
    }
    spec = '''
    source activate HyLiTE
    bash {subset}
    '''.format(subset=subset)
    return inputs, outputs, options, spec
for i, subset in enumerate(zip(subsets, subsets_out)):
    gwf.target_from_template("Subset"+str(i),
                             run_subset(subset[0], subset[1]))
def mergeHyLiTE(subset_out, directory, reference):
    inputs = subset_out
    outputs = []
    options = {
        'cores': 1,
        'memory': '12g',
        'account': 'NChain',
        'walltime': '48:00:00'
    }
    spec = '''
    source activate HyLiTE
    cd {dir}
    HyLiTE-merge -r {ref}
    '''.format(dir=directory, ref=reference)
    return inputs, outputs, options, spec
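# Note: the spec above cd's into the slicing directory, so the reference path is
# prefixed with "." in the call below; a reference_file of the form "./ref.fa"
# then becomes "../ref.fa" relative to that directory (this assumes reference_file
# starts with "./").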
gwf.target_from_template("mergeHyLiTE",
                         mergeHyLiTE(subsets_out, "./slicing_directories/", "."+reference_file))

Script for fixing ambiguous nucleotides in the mpileup file. Each ambiguous nucleotide is replaced by N, since HyLiTE, in its current version (as of writing), does not support ambiguous nucleotides.

from sys import argv, stdout
def read_and_fix_mpileup(filename, alphabet):
    f = open(filename)
    for line in f:
        line = line.split("\t")
        if line[0]=="retro": continue
        if line[2] not in alphabet: line[2] = "N"
        stdout.write("\t".join(line))
    f.close()
if __name__=="__main__":
    read_and_fix_mpileup(argv[1], "AGTCN")
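
The script is invoked exactly as in the fix_mpileup template above:

python mpileupfix.py repens_combo.pileup > repens_fixed.pileup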

A script to properly merge the HyLiTE subsets after they have been split with HacknHyLiTE. The HyLiTE-merge command that comes with HyLiTE simply concatenates the files instead of merging them.

from os.path import isfile, join
from glob import glob
from sys import argv
from optparse import OptionParser
def get_unique_samples(filelist):
    sample_names = set()
    for name in filelist:
        name = name.split("Ch.")[-1]
        name = name.split(".")[0]
        sample_names.add(name)
    return sample_names
def which_sample(name, samples):
    for sample in samples:
        if sample+"." in name:
            return sample
    return None
def group_by_sample(filelist, samples):
    new_group = {}
    for sample in samples:
        new_group[sample] = []
    for name in filelist:
        print(name)
        new_group[which_sample(name, samples)].append(name)
    return new_group
        
def joinfiles(filelist, filename):
    newf = open(filename, "w")
    header = None
    for name in filelist:
        f = open(name)
        if header is None:
            header = f.readline()
            newf.write(header)
        else:
            blank = f.readline()
            if header != blank:
                print("Mismatched header! file: {}".format(name))
        newf.write(f.read())
        f.close()
    newf.close()
def process_table(filename):
    f = open(filename)
    header = f.readline()
    header = header.replace("\n", "")
    header = header.split("\t")
    table = {}
    order = []
    
    for line in f:
        line = line.replace("\n", "")
        line = line.split("\t")
        gene = line[0]
        order.append(gene)
        table[gene] = {}
        for key, value in zip(header, line):
            try:
                value = int(value)
            except ValueError:
                pass
            table[gene][key] = value
    return table, order, header
def sum_rows(row1, row2):
    keys = row1.keys()
    newrow = {}
    for key in keys:
        if key=='GENE':
            newrow[key] = row1[key]
            continue
        newrow[key] = row1[key]+row2[key]
    return newrow
def row_sum(row):
    total = 0
    for value in row.values():
        if type(value)==int:
            total += value
    return total
        
def mergefiles(filelist, newfile):
    newf = open(newfile, "w")
    basename = filelist[0]
    table, order, header = process_table(basename)
    for name in filelist[1:]:
        new_table, _, _ = process_table(name)
        for gene in order:
            table[gene] = sum_rows(table[gene], new_table[gene])
    newf.write("\t".join(header)+"\n")
    for gene in order:
        row = []
        for name in header:
            row.append(str(table[gene][name]))
        newf.write("\t".join(row)+"\n")
    newf.close()
def read_protocol_file(filename):
    f = open(filename)
    info = {}
    for line in f:
        line = line.split("\t")
        name, _, sample, _, which = line
        if name not in info:
            info[name] = {}
        info[name][sample] = which
    return info
def extend_expression_information(filename, sample_info, proto_info, outfile):
    expression_table, gene_order, old_header = process_table(filename)
    P1_name = proto_info["P1"]["sample1"].split("/")[-1].split(".")[0]
    P2_name = proto_info["P2"]["sample1"].split("/")[-1].split(".")[0]
    
    new_header = []
    translator = {}
    sample_names = {}
    sample_set = set()
    ch_list = []
    for name in old_header:
        if name=="GENE":
            new_header.append(name)
            translator[name] = name
            continue
        ind, samp = name.split("%")
        origin = proto_info[ind][samp]
        if ind != "Ch":
            species, tissue = origin.split("/")[-1].split(".")[:2]
            compiled_name = "_".join([species, tissue])
            new_header.append(compiled_name)
            translator[compiled_name] = name 
            sample_names[compiled_name] = samp
        else:
            species, tissue = origin.split("/")[-1].split(".")[:2]
            compiled_name_P1 = "_".join([species, tissue, P1_name])
            compiled_name_P2 = "_".join([species, tissue, P2_name])
            new_header.append(compiled_name_P1)
            new_header.append(compiled_name_P2)
            translator[compiled_name_P1] = compiled_name_P1
            translator[compiled_name_P2] = compiled_name_P2
            ch_list.append([compiled_name_P1, compiled_name_P2])
            sample_names[compiled_name_P1] = samp
            sample_names[compiled_name_P2] = samp
            sample_set.add(samp)
    ## ADD NEW ROWS FROM SAMPLE INFO
    for gene in gene_order:
        for chs in ch_list:
            sample = sample_names[chs[0]]
            expression_table[gene][chs[0]] = sample_info[sample][0][gene]["P1"]+sample_info[sample][0][gene]["P1+N"]
            expression_table[gene][chs[1]] = sample_info[sample][0][gene]["P2"]+sample_info[sample][0][gene]["P2+N"]
    
    newf = open(outfile, "w")
        
    newf.write("\t".join(new_header)+"\n")
    for gene in gene_order:
        row = []
        for name in new_header:
            row.append(str(expression_table[gene][translator[name]]))
        newf.write("\t".join(row)+"\n")
    newf.close()
    
def pretty_print(message, colour="reset"):
    base_color = "\033[39m"
    my_color = {"reset": "\033[39m", "green": "\033[32m",
                "cyan": "\033[96m", "blue": "\033[34m",
                "red": "\033[31m", "lightblue": "\033[38;5;74m",
                "orange": "\033[38;5;202m"}
    print(my_color.get(colour, "\033[39m")+message+base_color)
    
if __name__=="__main__":
    _current_version = "0.1.4"
    usage = '''
    usage: python \033[4m%prog\033[24m \033[38;5;74m[options]\033[39m \033[32m<additional inputs>\033[39m'''
    parser = OptionParser(usage)
    parser.add_option('-o', type="string", nargs=1, dest="output", default="./", help="Output directory")
    parser.add_option('-d', type="string", nargs=1, dest="directory", default="./", help="Input directory of all subsets.")
    parser.add_option('-p', type="string", nargs=1, dest="protocol", default=None, help="Protocol file. (required)")
    pretty_print("======= mergeHyLiTE script (v{}) =======".format(_current_version), "green")
    options, args = parser.parse_args()
    
    directory = options.directory
    outputdir = options.output
    protocol = options.protocol
    if protocol is None:
        raise ValueError("No protocol file given, please provide using: -p filename")
    protocol_info = read_protocol_file(protocol)
    
    potential_files = glob(join(directory, "subset*"))
    
    subset_directories = list(filter(lambda x: not isfile(x), potential_files))
    reads = []
    readssummary = []
    expression = []
    snp = []
    snpsummary = []
    runsummary = []
    for sd in subset_directories:
        reads += glob(join(sd, "*.read.txt"))
        readssummary += glob(join(sd, "*.read.summary.txt"))
        expression += glob(join(sd, "*.expression.txt"))
        snp += glob(join(sd, "*.snp.txt"))
        snpsummary += glob(join(sd, "*.snp.summary.txt"))
        runsummary += glob(join(sd, "*.run.summary.txt"))
    sample_names = get_unique_samples(reads)
    print(sample_names)
    readssummary = group_by_sample(readssummary, sample_names)
    reads = group_by_sample(reads, sample_names)
    #print(readssummary)
    for sample in sample_names:
        print("Processing {}".format(sample))
        mergefiles(readssummary[sample], join(outputdir, sample+".read.summary.txt"))
    mergefiles(expression, join(outputdir, "combined.expression.txt"))
    sample_info = {}
    for sample in sample_names:
        sample_info[sample] = process_table(join(outputdir, sample+".read.summary.txt"))
    extend_expression_information(join(outputdir, "combined.expression.txt"), sample_info,
                                  protocol_info, join(outputdir, "completed.expression.txt"))
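
Usage sketch (the script's filename is not given in this document; assuming it is saved as, e.g., merge_hylite.py):

python merge_hylite.py -d ./slicing_directories/ -o ./ -p repens_protocol.txt

It scans every subset* directory under -d, merges the per-sample read summaries and the expression tables, and writes combined.expression.txt and completed.expression.txt into the output directory.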
    
