Source code for bsPlugins.PairedEnd

from bsPlugins import *
from bbcflib.track import track
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri as numpy2ri
import os, tarfile

meta = {'version': "1.0.0",
        'author': "BBCF",
        'contact': "webmaster-bbcf@epfl.ch"}

in_parameters = [{'id': 'bamfiles', 'type': 'bam', 'required': True, 'multiple': 'BamMulti'},
                 {'id': 'format', 'type': 'text'}]

out_parameters = [{'id': 'statistics_plot', 'type': 'pdf'},
                  {'id': 'fragment_track', 'type': 'track'},
                  {'id': 'fragment_track_tar', 'type': 'file'}]


class PairedEndForm(BaseForm):
    child = twd.HidingTableLayout()
    class BamMulti(twb.BsMultiple):
        label='Paired-end BAM files: '
        bamfiles = twb.BsFileField(label=' ',
                                   help_text='Select bam files',
                                   validator=twb.BsFileFieldValidator(required=True))
    format = twf.SingleSelectField(label='Output format: ',
                                   options=["sql", "bedGraph", "bigWig"],
                                   prompt_text=None,
                                   help_text='Format of the output file')
    submit = twf.SubmitButton(id="submit", value="Analyze")


[docs]class PairedEndPlugin(BasePlugin): """Computes statistics and genome-wide distribution of fragment sizes from mapped paired-end reads.""" info = { 'title': 'Analysis of paired-end fragment sizes', 'description': __doc__, 'path': ['Analysis', 'Paired-end analysis'], 'output': PairedEndForm, 'in': in_parameters, 'out': out_parameters, 'meta': meta } def _compute_stats(self, bam): _plast = -1 _buff = {} for read in bam: if read.is_reverse: continue self.nb_frag += 1 _p = read.pos _s = read.isize if _p == _plast: _buff[_s] = _buff.get(_s,0)+1 else: for _size,_rep in _buff.iteritems(): self.frag_size[_size] = self.frag_size.get(_size,0)+_rep self.frag_rep[_rep] = self.frag_rep.get(_rep,0)+1 _plast = _p _buff = {} for _rep in _buff.values(): self.frag_rep[_rep] = 1+self.frag_rep.get(_rep,0) def _plot_stats(self, bam_name): robjects.r.assign('rep_cnt',numpy2ri.numpy2ri(self.frag_rep.keys())) robjects.r.assign('rep_freq',numpy2ri.numpy2ri(self.frag_rep.values())) robjects.r.assign('size_distr',numpy2ri.numpy2ri(self.frag_size.keys())) robjects.r.assign('size_freq',numpy2ri.numpy2ri(self.frag_size.values())) robjects.r.assign('nb_frag',self.nb_frag) robjects.r.assign('main',bam_name) robjects.r(""" rep_cnt = as.integer(rep_cnt) Od = order(rep_cnt) rep_freq = as.integer(rep_freq)[Od]*1e-6 rep_cnt = rep_cnt[Od] I100 = rep_cnt<100 rep_cnt = c(rep_cnt[I100],100) rep_freq = c(rep_freq[I100],sum(rep_freq[!I100])) size_distr = as.integer(size_distr) Od = order(size_distr) size_freq = as.integer(size_freq)[Od]/nb_frag size_distr = size_distr[Od] par(mfrow=c(2,1),lwd=2,cex=1.1,cex.main=1.3,cex.lab=1.1,cex.axis=.8,oma=c(0,0,3,0),mar=c(5,5,1,1),las=1,pch=20) plot(rep_cnt,rep_freq,type='s',main='Fragment redundancy',xlab='Nb of copies',ylab='Frequency (millions)', log='y',xlim=c(1,100),xaxt='n',ylim=c(1e-6,nb_frag*1e-6)) abline(h=nb_frag*1e-6,col='red') text(50,nb_frag*1e-6,nb_frag,col='red',pos=1) axis(side=1,at=seq(10,100,by=10),labels=c(seq(10,90,by=10),">100")) plot(size_distr,size_freq,type='s',main='Fragment size distribution',xlab='Size',ylab='Density') title(main=main,outer=T) """) def __call__(self, **kw): _f = ['start','end','score'] format = kw.get("format") or "sql" bamfiles = kw.get('BamMulti',{}).get('bamfiles',[]) if not isinstance(bamfiles, (tuple,list)): bamfiles = [bamfiles] bamfiles = [track(bam) for bam in bamfiles] all_tracks = [] pdf = self.temporary_path(fname='Paired_end_plots.pdf') robjects.r('pdf("%s",paper="a4",height=11,width=8)' %pdf) for bam in bamfiles: tname = "%s_frags.%s" %(bam.name, format) outname = self.temporary_path(fname=tname) all_tracks.append(outname) trout = track(outname, fields=_f, chrmeta=bam.chrmeta) self.frag_rep = {} self.frag_size = {} self.nb_frag = 0 for chrom,cval in bam.chrmeta.iteritems(): self._compute_stats(bam.fetch(chrom, 0, cval['length'])) trout.write( bam.PE_fragment_size(chrom), fields=_f, chrom=chrom ) trout.close() self._plot_stats(bam.name) robjects.r('dev.off()') if len(all_tracks)>1: tarname = self.temporary_path(fname='PE_fragment_tracks.tgz') tar_tracks = tarfile.open(tarname, "w:gz") [tar_tracks.add(f,arcname=os.path.basename(f)) for f in all_tracks] tar_tracks.close() self.new_file(tarname, 'fragment_track_tar') else: self.new_file(all_tracks[0], 'fragment_track') self.new_file(pdf,'statistics_plot') return self.display_time()

Other BBCF projects