#!/usr/bin/perl -w ################################################ # written by Belinda Giardine ################################################ #expects caller to break apart mutations with multiple changes #numbering expected to be coding (c.*) #gvPos format #we only have: chrom start stop name baseChangeType use strict; my $pslFile = shift @ARGV; my $gbAcc = shift @ARGV; my $name = shift @ARGV; my $pslLine = ''; if (@ARGV) { $pslLine = shift @ARGV; $pslLine = "'$pslLine'"; } my $nameCopy = $name; if (!$name or !$pslFile or !$gbAcc) { print "USAGE: parseHgvsName2 file.psl GenBankAcc 'hgvs_name' ['pslLine']\n"; exit; } my $converter = 'convert_coors2'; my $strand; $name =~ s/^\w+://;#remove gene or refseq if present $name =~ s/^c\.//; #remove if present if ($name =~ /^\s*'*p\./) { if ($name =~ /^\s*'*p\.\w(\d+)\w'*\s*$/) { my $pos = $1; my @chr = getProtCoors($pos, $gbAcc); if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n"; }elsif ($name =~ /^\s*'*p\.[A-Z][a-z]{2}(\d+)[A-Z][a-z]{2}'*\s*$/ or $name =~ /^\s*'*p\.[A-Z][a-z]{2}(\d+)X'*\s*$/) { my $pos = $1; my @chr = getProtCoors($pos, $gbAcc); if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n"; }elsif ($name =~ /^\s*p\.\w(\d+)fsX*\d*\s*$/) { #an unspecified frameshift insertion or deletion, just mark first codon my $pos = $1; my @chr = getProtCoors($pos, $gbAcc); if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tsubstitution\t$chr[3]\n"; }elsif ($name =~ /^\s*p\.\w(\d+)del\s*$/) { #deletion of codon/amino acid my $pos = $1; my @chr = getProtCoors($pos, $gbAcc); if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } print "$chr[0]\t$chr[1]\t$chr[2]\t$name\tdeletion\t$chr[3]\n"; }elsif ($name =~ /^\s*p\.\w(\d+)_\w(\d+)del\s*$/) { #deletion of multiple codons/amino acids my $pos = $1; my $pos2 = $2; my @chr = getProtCoors($pos, $gbAcc); if ($chr[0] =~ /ERROR/) { print $chr[0]; exit; } my @chr2 = getProtCoors($pos2, $gbAcc); if ($chr2[0] =~ /ERROR/) { print $chr2[0]; exit; } if ($chr[1] > $chr2[1]) { print "$chr[0]\t$chr2[1]\t$chr[2]\t$name\tdeletion\t$chr[3]\n"; }else { print "$chr[0]\t$chr[1]\t$chr2[2]\t$name\tdeletion\t$chr[3]\n"; } }else { print "ERROR sorry didn't recognize protein based name $name.\n"; } }elsif ($name =~ /delins/) { if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)\s*delins(.*)/) { my $pos = $1; my $ins = $2; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*)_([*-]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2; } #convert to chrom and print my($chr, $cst) = convert_pos($pos); my $cend; if ($pos eq $pos2 && $cst) { $cend = $cst; } if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n"; } }elsif ($name =~ /del|inv/) { #need to limit front to prevent 'ex11-ex12del' as coming through as '12del' if ($name =~ /^[cg]?\.?([-*]?\d+[+*-]?\d*(_[-*]?\d+)?[+*-]?\d*)\s*(del|inv)([ACTG]*\d*)/) { my $pos = $1; my $t = $3; my $del = $4; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2; } elsif ($name =~ /([-*]?\d+[+*-]?\d*)del(\d+)$/) { $pos2 += ($2 - 1); } my $len; if ($del =~ /\D/) { $len = length $del; } elsif ($del) { $len = $del; } #allow different lengths, may be unclear which deleted #if ($len && $pos !~ /\d+[-+*]\d+/ && $pos2 !~ /\d+[-+*]\d+/ && $pos2 - $pos != $len - 1) { #print "ERROR lengths of deletion are not equal\n"; #exit; #} #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } my $type = 'deletion'; if ($nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; } if ($t eq 'inv') { $type = 'complex'; } #inversion print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n"; }elsif ($name =~ /^[cg]?\.?([-*]?\d+[+*-]?(\d+|\?)?_?[-*]?\d*[+*-]?(\d+|\?)?)\s*del([ACTG]*\d*)/) { my $pos = $1; my $del = $4; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*\??)_([-*]?\d+[+*-]?\d*\??)/) { $pos = $1; $pos2 = $2; } else { print "ERROR couldn't map fuzzy name $name"; exit; } $pos =~ s/\?/1/; #replace unknown with first nt $pos2 =~ s/\?/1/; my $len; if ($del =~ /\D/) { $len = length $del; } elsif ($del) { $len = $del; } #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /ERROR/) { print $chr; exit; } ($chr, $cend) = convert_pos($pos2); if ($chr =~ /ERROR/) { print $chr; exit; } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } my $type = 'deletion'; if ($nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; } print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n"; }elsif ($name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)\-?\??_\(([-*]?\d+[+*-]?\d*)_\?\)\+?\??\s*(del|dup|inv)/ || $name =~ /^[cg]?\.?([-*]?\d+[+*-]?\d*)_\(([-*]?\d+[+*-]?\d*)_\?\)\s*(del| dup|inv)/ || $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_([-*]?\d+[+*-]?\d*)\s*(del|dup|inv)/ || $name =~ /^[cg]?\.?([-*]?\d+[+*-]\?)_\(([-*]?\d+[+*-]?\d*)_\?\)\+?\??\s*(del|dup|inv)/ || $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_([-*]?\d+[+*-]\?)\s*(del|dup|inv)/ || $name =~ /^[cg]?\.?\(\?_([-*]?\d+[+*-]?\d*)\)_\((\*\d+)_\?\)(del|dup|inv)/ || $name =~ /[cg]?\.?([-*]?\d+[+*-]\?)_\(([-*]?\d+[+*-]?\d*)_[-*]?\d+[+*-]?\d*\)\+\?(del|dup|inv)/) { #have minimum size, max is unknown my $pos = $1; my $t = $3; my $pos2 = $2; if ($pos =~ /\?/) { $pos =~ s/\?/1/; } if ($pos2 =~ /\?/) { $pos2 =~ s/\?/1/; } #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /ERROR/) { print $chr; exit; } ($chr, $cend) = convert_pos($pos2); if ($chr =~ /ERROR/) { print $chr; exit; } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } my $type = 'deletion'; if ($t eq 'del' && $nameCopy =~ /del[ACTG0-9]+\s{0,1}ins[0-9ACTG]+/) { $type = 'complex'; } elsif ($t eq 'dup') { $type = 'duplication'; } elsif ($t eq 'inv') { $type = 'complex'; } print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n"; }elsif ($name =~ /([A-Z0-9._]+):(g\.)?(\d+)_(o\.)?([A-Z0-9._]+):(g\.)?(\d+)del/) { #large deletion across reference sequences (assume genomic) my $ref = $1; my $pos = $3; my $ref2 = $5; my $pos2 = $7; $gbAcc = $ref; $pslLine = ''; my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /ERROR/) { print $chr; exit; } my $off = getOffset($gbAcc); $cst += $off; $gbAcc = $ref2; ($chr, $cend) = convert_pos($pos2); if ($chr =~ /ERROR/) { print $chr; exit; } $off = getOffset($gbAcc); $cend += $off; if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } my $type = 'deletion'; print "$chr\t$cst\t$cend\t$nameCopy\t$type\t$strand\n"; }else { print "ERROR bad deletion format for $nameCopy\n"; } }elsif ($name =~ /ins/) { if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)\s*ins[ACTGactg]*\d*/) { my $pos = $1; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;} #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } print "$chr\t$cst\t$cend\t$nameCopy\tinsertion\t$strand\n"; }else { print "ERROR bad insertion format for $nameCopy\n"; } }elsif ($name =~ /dup/) { if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)dup[ACTGactg]*\d*/) { my $pos = $1; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;} #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n"; }elsif ($name =~ /([-*]?\d+[+*-]?\d*)dup[ACTGactg]*\d*/) { my $pos = $1; my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { $cend = $cst; } print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n"; }elsif ($name =~ /\(\?_([-*]?\d+)\)_\(([-*]?\d+)_\?\)dup/) { my $pos = $1; my $pos2 = $2; #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } #this should be marked as estimated coordinates by caller print "$chr\t$cst\t$cend\t$nameCopy\tduplication\t$strand\n"; }else { print "ERROR bad duplication format for $nameCopy\n"; } }elsif ($name =~ /inv/) { if ($name =~ /([-*]?\d+[+*-]?\d*_?[-*]?\d*[+*-]?\d*)inv\d*/) { my $pos = $1; my $pos2 = $pos; if ($pos =~ /([-*]?\d+[+*-]?\d*)_([-*]?\d+[+*-]?\d*)/) { $pos = $1; $pos2 = $2;} #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } #should we add an inversion type? print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n"; }else { print "ERROR bad inversion format for $nameCopy\n"; } }elsif ($name =~ /\>/) { if ($name =~ /([-*]?\d+[+*-]?\d*)\s*[ACTGactg]\>[ACTGactg]/) { my $pos = $1; my ($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos, $chr); } if (!$chr or $chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { $cend = $cst; } if (!$cst) { print "ERROR missing start for $nameCopy and $chr\n"; exit; } print "$chr\t$cst\t$cend\t$nameCopy\tsubstitution\t$strand\n"; }else { print "ERROR bad substitution format [$name]\n"; } }elsif ($name =~ /([-*]?\d+[+*-]?\d*)\s*([ACTG]+)\(\d+(_\d+)?\)/ or $name =~ /([-*]?\d+[+*-]?\d*)\s*([ACTG]+)\[\d+(_\d+)?\]/) { #repeating nts my $pos = $1; my $nts = $2; my $len = length $nts; my $pos2; if ($pos =~ /(\d+)([+-]\d+)/) { my $t = $2 + $len - 1; if ($t > 0) { $t = "+$t"; } $pos2 = "$1$t"; }elsif ($pos =~ /(\d+)\*(\d+)/) { my $t = $2 + $len - 1; $pos2 = "$1*$t"; }else { $pos2 = $pos + $len - 1; } #convert to chrom and print in gvPos format my($chr, $cst) = convert_pos($pos); my $cend; if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } if (!$cend) { ($chr, $cend) = convert_pos($pos2); if ($chr =~ /in a gap/) { ($chr, $cst, $cend) = redo_gap($pos, $pos2, $chr); } if ($chr =~ /ERROR/) { print $chr; exit; } } if ($cend < $cst) { #-strand need to switch order my $t = $cend; $cend = $cst; $cst = $t; } print "$chr\t$cst\t$cend\t$nameCopy\tcomplex\t$strand\n"; }else { print "ERROR didn't recognize format of $nameCopy\n"; } exit; #this converts a coding position to a chrom position sub convert_pos { my $pos = shift @_; my $chr; my $cnum; my $fh; open($fh, "$converter $pslFile $gbAcc '$pos' $pslLine 2>&1 $pslLine |") or die "ERROR Couldn't run convert_coors2, $!\n"; while (<$fh>) { if (/ERROR/) { $chr = $_; last; } /is mapped to (\w+\d*) (\d+) (\+|\-)/; $cnum = $2; $chr = $1; $strand = $3; } if (!$chr) { close $fh; return("ERROR couldn't convert '$pos'", undef); } close $fh or die "ERROR Couldn't finish convert_coors run with $converter $pslFile $gbAcc $pos $pslLine, $!\n"; if ($chr =~ /ERROR/ && $pos !~ /\d+[\+\*-]\d+/ && $chr =~ /end of gene at (\d+)/) { my $end = $1; my $diff = $pos - $end; open($fh, "$converter $pslFile $gbAcc $end+$diff $pslLine 2>&1 |") or die "ERROR Couldn't run convert_coors2, $!\n"; while (<$fh>) { if (/ERROR/) { $chr = $_; last; } /is mapped to (\w+\d*) (\d+) (\+|\-)/; $cnum = $2; $chr = $1; $strand = $3; } close $fh or die "ERROR Couldn't finish convert_coors run with $converter $pslFile $gbAcc $end+$diff $pslLine, $!\n"; } return($chr, $cnum); } ####End #this redoes mutations in gaps as if an insertion in the chrom coors sub redo_gap { my $pos = shift; my $pos2 = shift; my $chr = shift; my $cst; my $cend; while ($chr =~ /in a gap/) { $pos--; if ($pos <= 0) { last; } ($chr, $cst) = convert_pos($pos); } if ($chr =~ /ERROR/) { return ($chr, $cst, $cend); } ($chr, $cend) = convert_pos($pos2); while ($chr =~ /in a gap/) { $pos2++; ($chr, $cend) = convert_pos($pos2); } $nameCopy .= ' #insertion in reference#'; if ($cst > $cend) { #minus strand switch order my $t = $cst; $cst = $cend; $cend = $t; } if ($cend - $cst > 10) { $chr = 'ERROR did not map to chrom sequence'; } return ($chr, $cst, $cend); } ####End sub getProtCoors { my $pos = shift; my $acc = shift; my @chr; my $fh; my $command = "convert_prot_coors3 $pslFile $acc $pos 2>&1 $pslLine |"; open ($fh, $command) or die "ERROR Couldn't run convert_prot_coors3 $pslFile $acc $pos, $!\n"; while (<$fh>) { chomp; if (/ERROR/) { $chr[0] = $_; last; } @chr = split(/\t/); } close $fh or die "Couldn't finish convert_prot_coors3 $pslFile $acc $pos $pslLine, $!\n"; return @chr; } ####End sub getOffset { my $acc = shift; open(FH, '<', $pslFile) or die "ERROR couldn't open gene decription file, $pslFile, $!\n"; my @f; my $off = 0; while () { chomp; if (!defined or $_ eq '') { next; } if (/\s*#/) { next; } #comment @f = split(/\s+/); if (scalar @f < 21) { die "ERROR bad psl file format $_\n"; } elsif (scalar @f == 22) { shift @f; } #remove bin, not in downloads if ($f[9] eq $acc) { last; } elsif ($f[9] =~ /^$acc:(\d+)$/) { $off = $1; last; } #have seq with offset } close FH or die "ERROR reading psl file, $pslFile $!\n"; if ($f[9] !~ /$acc/) { die "ERROR failed to find $acc in psl file\n"; } return $off; } ####End