brentp / hts-nim

nim wrapper for htslib for parsing genomics data files

Home Page:https://brentp.github.io/hts-nim/

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Modifying GT from ivcf and write to ovcf for multiple pairs of inputs and outputs

ruqianl opened this issue · comments

Hi Brent,

I'm trying to read in Variants from an input VCF and write them to an output VCF after modifying the GT field. The code snippet I have works for one input and one output. However, when I put that into a for loop to loop through multiple chroms, it errors out as:

free(): invalid pointer
Traceback (most recent call last)
/mnt/mcfiles/rlyu/Projects/sgcocaller/tests/read_write_vcf.nim(41) read_write_vcf
/mnt/mcfiles/rlyu/Projects/sgcocaller/tests/read_write_vcf.nim(20) write_linked_blocks
/mnt/mcfiles/rlyu/.nimble/pkgs/hts-0.3.12/hts/vcf.nim(743) copy
/usr/local/bin/nim-1.4.0/lib/system/gc.nim(439) newObj
/usr/local/bin/nim-1.4.0/lib/system/gc_common.nim(430) prepareDealloc
/mnt/mcfiles/rlyu/.nimble/pkgs/hts-0.3.12/hts/vcf.nim(423) destroy_vcf
SIGABRT: Abnormal termination.

Code:

## test open new vcf and close
import tables
import math
import sequtils
import hts

# Value passed as the `threads` argument to both `open` calls below.
let threads  = 1

proc write_linked_blocks(ivcf:VCF, ovcf:VCF):int = 
  ## Assigns `ivcf`'s header to `ovcf`, adds one extra header line, writes
  ## the header, then writes each variant yielded by `ivcf.query("*")` to
  ## `ovcf` after overwriting its GT format field with the int32 values
  ## 2 and 5.
  ## Returns 0; a failed GT update or a failed write ends the program via
  ## `quit`.

  # NOTE(review): per the maintainer's reply later in this thread, this
  # assignment leaves both VCFs pointing to the same header object, which
  # then gets freed twice; `ovcf.copy_header(ivcf.header)` is the suggested
  # replacement (with `ovcf` taken as `var VCF`).
  ovcf.header = ivcf.header
  # The results of add_string and write_header are explicitly discarded,
  # so a failure of either goes unnoticed.
  discard ovcf.header.add_string("""##sgcocaller_v0.1=phaseBlocks""")
  discard ovcf.write_header()
  var gt_string:seq[int32]
  # presumably "*" selects every record of the input -- confirm against the
  # hts-nim documentation.
  for v in ivcf.query("*"):
    # The same two raw values are written for every record.
    gt_string = @[int32(2),int32(5)]
    if v.format().set("GT",gt_string) != Status.OK:
      quit "set GT failed"
    if not ovcf.write_variant(v) :
      # NOTE(review): `voff` is not declared anywhere in this snippet.
      quit "write vcf failed for " & $voff
  return 0

# Builds the chromosome names "chr1" and "chr2".
var chrs = map(toSeq(1..2), proc (x:int): string = "chr" & $x) 

# Declared once outside the loop and reused by every iteration.
var ivcf,ovcf: VCF
var inFile,outFile:string

# For each chromosome: open the input and output VCFs, rewrite the genotypes
# with write_linked_blocks, then close both files.
# Per the discussion in this thread, the crash only shows up when this loop
# processes more than one chromosome.
for chrName in chrs:
  echo "writing for " & chrName
  inFile  = "data/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt." & chrName & ".vcf.gz"
  outFile = "tests/" & chrName & "_block_phased.vcf.gz"
  if not open(ivcf, inFile, threads=threads):
      quit "couldn't open input vcf file"
  if not open(ovcf, outFile, threads=threads, mode ="w"):
    quit "couldn't open output vcf file"
  # The proc always returns 0, so its result is discarded.
  discard write_linked_blocks(ivcf,ovcf)
  ivcf.close()
  ovcf.close()

I might not be handling some pointer operation right. Thanks for any advice.

Best,
Ruqian

Hi, I'm not sure what the problem could be. One guess is to switch these 2 lines:

 ovcf.header = ivcf.header
 discard ovcf.header.add_string("""##sgcocaller_v0.1=phaseBlocks""")

to be:

 discard ivcf.header.add_string("""##sgcocaller_v0.1=phaseBlocks""")
 ovcf.header = ivcf.header

so that you add the header to the input vcf and then copy.

if that doesn't work, can you paste an example VCF with header and a single variant so I can debug?

Thanks Brent, I quickly had a go but it didn't work for my original files.
So I created smaller vcf files with a few variants and tested the code again, and it doesn't produce errors anymore.

I could still reproduce the error with the file I uploaded. You could just copy the vcf and rename it to chr2.vcf.gz and run the following:

## test open new vcf and close
import tables
import math
import sequtils
import hts

# Value passed as the `threads` argument to both `open` calls below.
let threads  = 1

proc write_linked_blocks(ivcf:VCF, ovcf:VCF):int = 
  ## Adds one extra header line to `ivcf`'s header, assigns that header to
  ## `ovcf`, writes it, then writes each variant yielded by
  ## `ivcf.query("*")` to `ovcf` after overwriting its GT format field with
  ## the int32 values 2 and 5.
  ## Returns 0; a failed GT update or a failed write ends the program via
  ## `quit`.

  # NOTE(review): `current_line`, `currentBlock` and `block_pos_i` are
  # declared but never used in this proc.
  var current_line: string
  # Never updated below, so it stays 1 for the whole run.
  var v_off = 1
  var currentBlock = 0
  # The result of add_string is explicitly discarded.
  discard ivcf.header.add_string("""##sgcocaller_v0.1=phaseBlocks""")
  # NOTE(review): per the maintainer's reply later in this thread, this
  # assignment leaves both VCFs pointing to the same header object, which
  # then gets freed twice; `ovcf.copy_header(ivcf.header)` is the suggested
  # replacement (with `ovcf` taken as `var VCF`).
  ovcf.header = ivcf.header
  discard ovcf.write_header()
  var gt_string:seq[int32]
  var block_pos_i = 0
  # presumably "*" selects every record of the input -- confirm against the
  # hts-nim documentation.
  for v in ivcf.query("*"):
    # The same two raw values are written for every record.
    gt_string = @[int32(2),int32(5)]
    if v.format().set("GT",gt_string) != Status.OK:
      quit "set GT failed"
    if not ovcf.write_variant(v) :
      # `voff` resolves to `v_off` (Nim identifiers ignore underscores after
      # the first character), so the message always ends in "1".
      quit "write vcf failed for " & $voff
  return 0

# Builds the chromosome names "chr1" and "chr2".
var chrs = map(toSeq(1..2), proc (x:int): string = "chr" & $x) 

# Declared once outside the loop and reused by every iteration.
var ivcf,ovcf: VCF
var inFile,outFile:string

# For each chromosome: open the input and output VCFs, rewrite the genotypes
# with write_linked_blocks, then close both files.
# Per the discussion in this thread, the crash only shows up when this loop
# processes more than one chromosome.
for chrName in chrs:
  echo "writing for " & chrName
  inFile  = "tests/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt." & chrName & ".vcf.gz"
  outFile = "tests/FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt." & chrName & "_block_phased.vcf.gz"
  if not open(ivcf, inFile, threads=threads):
      quit "couldn't open input vcf file"
  if not open(ovcf, outFile, threads=threads, mode ="w"):
    quit "couldn't open output vcf file"
  # The proc always returns 0, so its result is discarded.
  discard write_linked_blocks(ivcf,ovcf)
  ivcf.close()
  ovcf.close()

FVB_NJ.mgp.v5.snps.dbSNP142.homo.alt.chr1.zip.zip

Hi, I ran this with only changing:

var chrs = @["chr1"]

and it succeeds without problem. Maybe you could try updating hts-nim and nim. I have:

$ nim --version
Nim Compiler Version 1.5.1 [Linux: amd64]
Compiled at 2021-06-21
Copyright (c) 2006-2021 by Andreas Rumpf

git hash: ad70a65e0e3eda39a3ca074af9feadb68f10598f

and latest tagged hts-nim (though I don't think that there have been any relevant, recent changes).

Thanks for testing.

Yeah, it works for me when only running one chromosome too, but only looping through more than one chromosome would trigger the error.

I see. you should use:

  ovcf.copy_header(ivcf.header)

instead of ovcf.header = ivcf.header; otherwise they are pointing to the same object and it gets freed twice. I am pushing a fix for the latter now, but you can and should use copy_header (you'll have to adjust the signature of the function to take a var VCF instead of just VCF).