#!/usr/bin/env perl # this script works on top of the 'halTools' toolbox # and exports a hal alignment to maf by splitting the alignment into several smaller # alignment chunks of a certain size. An overlap between two consecutive alignment chunks can # be specified. Furthermore, a list of 'genic' regions/intervals can be passed. # In this case, the splitting is restricted to intergenic regions, e.g. position # outside of the given intervals. # Stefanie Koenig 2.06.2015 use strict; use warnings; use Getopt::Long; # for parameter specification on the command line my $usage = <<'ENDUSAGE'; hal2maf_split.pl this script works on top of the 'halTools' toolbox and exports a hal alignment to maf by splitting the alignment into several smaller alignment chunks of a certain size. An overlap between two consecutive alignment chunks can be specified. Furthermore, a list of 'genic' regions/intervals can be passed. In this case, the splitting is restricted to intergenic regions, e.g. position outside of the given intervals. SYNOPSIS hal2maf_split.pl --halfile aln.hal --refGenome genome --halfile F F is the input hal file --refGenome S S is the name of the reference genome OPTIONS --help output this help message --keepDupes keep duplicates, i.e. alignments of a sequence with itself (default: off) --keepAncestors export ancestral sequences (default: off) --refSequence S S is the name of the reference sequence within the reference genome (default: all sequences in the reference genome) --outdir D D is the directory to which the output MAF files are written (default: current directory) --chunksize N size of the aligment chunk. N is the number of bases in the reference genome that are covered by the alignment chunks (default: 2500000) --overlap N overlap between to consecutive alignment chunks. N is the nunber of overlapping bases in the reference genome (default: 500000) --cpus N number of cpus (default: 1) --hal_exec_dir D D is the path to the hal executables. If not specified it must be in \$PATH environment variable. --no_split_list L list of 'genic' intervals. The splitting of the alignment is not allowed within these regions. L is a file with the following format: seqname start end . Example: chr2 120567671 120601255 chr2 120604238 120609520 chr5 65261850 65335670 chr5 56530780 865308994 ... DESCRIPTION Example: hal2maf_split.pl --halfile flies.hal --refGenome dmel --refSequence 3L --cpus 8 --hal_exec_dir /home/stefanie/tools/progressiveCactus/submodules/hal/bin ENDUSAGE sub cannot_overlap; my ($halfile, $refGenome, $keepDupes, $keepAncestors, $refSequence, $no_split_list, $help); # options my $h2m_param = ""; my $hal_exec_dir; my $chunksize = 2500000; my $overlap = 500000; my $padding = 10000; my $outdir; my $min_intergenic = 2000; # when a list of genic intervals is given, # all intervals that are within this distance are combined to a single interval my $cpus = 1; GetOptions('halfile:s'=>\$halfile, 'refGenome:s'=>\$refGenome, 'refSequence:s'=>\$refSequence, 'keepDupes!'=>\$keepDupes, 'keepAncestors!'=>\$keepAncestors, 'chunksize:i' =>\$chunksize, 'overlap:i' =>\$overlap, 'cpus:i' =>\$cpus, 'no_split_list:s' =>\$no_split_list, 'hal_exec_dir:s' =>\$hal_exec_dir, 'outdir:s' =>\$outdir, 'help!'=>\$help); if ($help){ print $usage; exit(1); } if(!defined($halfile)){ print "hal alignment missing.\n$usage"; exit(1); } if(!defined($refGenome)){ print "reference genome missing.\n$usage"; exit(1); } if(!defined($refGenome)){ print "reference genome missing.\n$usage"; exit(1); } if(!defined($keepDupes)){ $h2m_param.=" --noDupes"; } if(!defined($keepAncestors)){ $h2m_param.=" --noAncestors"; } if(defined($outdir)){ if($outdir =~/[^\/]$/){ $outdir .= '/'; } unless (-e $outdir) { mkdir $outdir; } } else{ $outdir = ""; } # check whether this perl module for paralell execution is installed my $got_ForkManager = 0; eval { require Parallel::ForkManager }; unless ($@) { $got_ForkManager = 1; Parallel::ForkManager->import(); } if (!$got_ForkManager && $cpus > 1){ print STDERR "The perl module Parallel::ForkManager is required to run hal2maf_split.pl in parallel.\n"; print STDERR "Install this module first. On Ubuntu linux install with\n"; print STDERR "sudo apt-get install libparallel-forkmanager-perl\n"; print STDERR "Will now run sequentially (--cpus=1)...\n"; $cpus = 1; } if(defined($hal_exec_dir)){ if($hal_exec_dir =~/[^\/]$/){ $hal_exec_dir .= '/'; } } else{ $hal_exec_dir = ""; } my $halStats = $hal_exec_dir . "halStats"; my $hal2maf = $hal_exec_dir . "hal2maf"; # check whether halStats and hal2maf are properly installed if (qx(which "$halStats") !~ /halStats$/){ die ("$halStats is not executable. Please add the directory which contains the executable halStats to the PATH environment variable or specify the path with --hal_exec_dir."); } if (qx(which "$hal2maf") !~ /hal2maf$/){ die ("$hal2maf is not executable. Please add the directory which contains the executable hal2maf to the PATH environment variable or specify the path with --hal_exec_dir."); } # reading in the name and length of each sequence in the reference genome ... my @seqlist = qx($halStats --chromSizes $refGenome $halfile); my $r_c = $?; # return code if($r_c != 0){ print "terminated after an error in halStats.\n"; exit(1); } # ... and store it in a hash my %seqHash = (); foreach(@seqlist){ chomp; my($seqname,$len)=split("\t",$_); next if(defined($refSequence) && $refSequence ne $seqname); # if a reference sequence is specified, skip all others $seqHash{$seqname}=$len; } if(defined($refSequence) && !exists($seqHash{$refSequence})){ print "Reference sequence $refSequence not found in reference genome $refGenome\n"; exit(1); } # reading in no_split_list my @intervals = (); if(defined($no_split_list)){ open(IN, "<$no_split_list") or die ("Could not open $no_split_list"); while(){ chomp; my($chr,$start,$end)=split("\t",$_); next if(defined($refSequence) && $refSequence ne $chr); # if a reference sequence is specified, skip all others push @intervals,[$chr, $start, $end]; } close(IN); } # join neighboring genic intervals within a distance of atmost $min_intergenic bases # sort intervals by 1. chromosome, 2. start @intervals = sort {$a->[0] cmp $b->[0] || $a->[1] <=> $b->[1]} @intervals; my @joined=(); # array of sorted, joined genic intervals: Elements: my ($chr, $start, $end); my $pred_end = 0; foreach (@intervals){ if(defined($chr) && $_->[0] eq $chr && $_->[1] - $end <= $min_intergenic ) { if($end < $_->[2]){ $end = $_->[2]; } } else { if (defined($chr)){ my $succ_start = $seqHash{$chr} - 1; # start position of succeeding interval on this sequence if($_->[0] eq $chr){ $succ_start = $_->[1]; } push @joined,[$chr, $start, $end, $pred_end, $succ_start]; $pred_end = $end; # end position of preceeding interval on this sequence if($_->[0] ne $chr){ $pred_end = 0; } } ($chr, $start, $end) = ($_->[0], $_->[1], $_->[2]); } } if (defined($chr)){ my $succ_start = $seqHash{$chr} - 1; push @joined,[$chr, $start, $end, $pred_end, $succ_start]; } # calculate start and end positions of alignment chunks # # if a list of genic intervals is specified, the start/end of an alignment chunk is shifted to # an intergenic region if it is contained in a genic interval: # # case 1) # the start position s of an alignment chunk A is contained in a genic interval # # s # |----------------------| alignment chunk A # |---------| |---------| genic intervals I1,...,Im (all disjunct) # y x # # => s is shifted to the upstream intergenic region: s=max{(x+y)/2, x-10kb} # # case 2) # the end position e of an alignment chunk A is contained in a genic interval # # e # |----------------------| alignment chunk A # |---------| |---------| genic intervals I1,...,Im (all disjunct) # x y # # => e is shifted to the downstream intergenic region: e=min{(x+y)/2, x+10kb} my @aln_chunks = (); # each element is one alignment chunk, e.g. sequence segment (seqname:start-end) in the reference foreach my $seq (sort {$a cmp $b} keys %seqHash ){ my $seqlen = $seqHash{$seq}; my $start = 0; while($start + $chunksize <= $seqlen){ my $end = $start + $chunksize - 1; # go to first genic interval that may contain the start position of the alignment chunk while (@joined && cannot_overlap($joined[0], [$seq, $start, $end])){ shift @joined; } foreach(@joined) { last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $start)); if($_->[0] eq $seq && $_->[1] <= $start && $_->[2] >= $start){ # start position of alignment chunk within a genic interval $start = int(($_->[1]+$_->[3])/2); #shift start position fo upstream intergenic region if($_->[1]-$padding > $start){ $start=$_->[1]-$padding; } last; } } foreach(@joined) { last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $end)); if($_->[0] eq $seq && $_->[1] <= $end && $_->[2] >= $end){ # end position of alignment chunk within a genic interval $end = int(($_->[2]+$_->[4])/2); # shift end position to downstream intergenic region if($_->[2]+$padding<$end){ $end=$_->[2]+$padding; } last; } } push @aln_chunks,[$seq, $start, $end]; # add alignment chunk $start = $end + 1 - $overlap; # start position of next alignment chunk } # last alignment chunk my $end = $seqlen - 1; foreach(@joined) { last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $start)); if($_->[0] eq $seq && $_->[1] <= $start && $_->[2] >= $start){ $start = int(($_->[1]+$_->[3])/2); if($_->[1]-$padding > $start){ $start=$_->[1]-$padding; } last; } } push @aln_chunks,[$seq, $start, $end]; # add alignment chunk } # export alignment chunks in parallel to maf format if($cpus > 1){ my $pm = new Parallel::ForkManager($cpus); foreach(@aln_chunks){ # this part is done in parallel by the child process my $pid = $pm->start and next; my ($seq, $start, $end) = ($_->[0], $_->[1], $_->[2]); my $chunksize = $end - $start + 1; my $cmd = "$hal2maf --refGenome $refGenome $h2m_param --refSequence $seq --start $start --length $chunksize $halfile $outdir$seq.$start-$end.maf"; print "$cmd\n"; qx($cmd); my $r_c = $?; # return code if($r_c != 0){ print "terminated after an error in hal2maf.\n"; exit(1); } $pm->finish; # terminate the child process } $pm->wait_all_children; } else{ # export alignment chunks sequentially to maf format foreach(@aln_chunks){ my ($seq, $start, $end) = ($_->[0], $_->[1], $_->[2]); my $chunksize = $end - $start + 1; my $cmd = "$hal2maf --refGenome $refGenome $h2m_param --refSequence $seq --start $start --length $chunksize $halfile $outdir$seq.$start-$end.maf"; print "$cmd\n"; qx($cmd); my $r_c = $?; if($r_c != 0){ print "terminated after an error in hal2maf.\n"; exit(1); } } } # returns true if interval a # - comes before interval b in the sorting order and # - does not overlap with b sub cannot_overlap { my $a = shift; my $b = shift; if ($a->[0] lt $b->[0] or ( $a->[0] eq $b->[0] and $a->[2] < $b->[1])){ return 1; } return 0; }