Source code for bsPlugins.DESeq

from bsPlugins import *
from bbcflib.track import track
from bbcflib import genrep
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri as numpy2ri
import numpy
import os,shutil
import itertools

ftypes = [(0, 'genes bodies'), (1, 'gene promoters'), (2, 'exons'), (3, 'custom upload')]
prom_up_def = 1000
prom_down_def = 100

__requires__ = ["ryp2", "numpy"]


class DESeqForm(BaseForm):
    child = twd.HidingTableLayout()
    input_type = twd.HidingRadioButtonList(label='Input type: ',
                                           options=['Table', 'Signals'],
                                           mapping={'Table': ['table'],
                                                    'Signals': ['Group1','Group2','feature_type','assembly'],},
                                           value='Table',
                                           help_text='Select input type (Formatted table, or signal tracks)')
    table = twb.BsFileField(label='Table: ',
        help_text='Select scores table',
        validator=twb.BsFileFieldValidator(required=True))
    class Group1(twb.BsMultiple):
        label = 'Signals group 1: '
        signals1 = twb.BsFileField(label=' ',
            help_text='Select signal files (position and score, e.g. bedgraph)',
            validator=twb.BsFileFieldValidator(required=True))
    class Group2(twb.BsMultiple):
        label = 'Signals group 2: '
        signals2 = twb.BsFileField(label=' ',
            help_text='Select signal files (position and score, e.g. bedgraph)',
            validator=twb.BsFileFieldValidator(required=True))
    feature_type = twd.HidingSingleSelectField(label='Feature type: ',
        options=ftypes, prompt_text=None,
        mapping={ftypes[-1][0]: ['features'],
                 1: ['upstream', 'downstream']},
        help_text='Choose a feature set or upload your own',
        validator=twc.Validator(required=True))
    features = twb.BsFileField(label='Custom feature set: ',
        help_text='Select a feature file (e.g. bed)',
        validator=twb.BsFileFieldValidator(required=True))
    upstream = twf.TextField(label='Promoter upstream distance: ',
        validator=twc.IntValidator(required=True),
        value=prom_up_def,
        help_text='Size of promoter upstream of TSS')
    downstream = twf.TextField(label='Promoter downstream distance: ',
        validator=twc.IntValidator(required=True),
        value=prom_down_def,
        help_text='Size of promoter downstream of TSS')

    assembly = twf.SingleSelectField(label='Assembly: ',
        options=genrep.GenRep().assemblies_available(),
        validator=twc.Validator(required=True),
        help_text='Reference genome')
    submit = twf.SubmitButton(id="submit", value="Submit")


meta = {'version': "1.0.0",
        'author': "BBCF",
        'contact': "webmaster-bbcf@epfl.ch"}

in_parameters = [
        {'id': 'input_type', 'type': 'radio'},
        {'id': 'signals1', 'type': 'track', 'required': True, 'multiple': 'Group1'},
        {'id': 'signals2', 'type': 'track', 'required': True, 'multiple': 'Group2'},
        {'id': 'table', 'type': 'txt', 'required': True},
        {'id': 'feature_type', 'type': 'int'},
        {'id': 'upstream', 'type': 'int'},
        {'id': 'downstream', 'type': 'int'},
        {'id': 'assembly', 'type': 'assembly'},
        {'id': 'features', 'type': 'track'},
]
out_parameters = [{'id': 'differential_expression', 'type': 'file'}]


[docs]class DESeqPlugin(BasePlugin):
    """Gets the score associated to each genomic feature in each sample and runs DESeq
for differential analysis within them. It returns a tab-delimited file with the following fields:

Name, MeanA, MeanB, fold change, adjusted p-value.

The input can be of two different types:

* Two sets of 'signal' files - i.e. bedGraph-type text files - one for each of the two groups to compare -,
  and a list of genomic features - either from a pre-defined list such as Ensembl genes,
  or a custom bed-like file. For every feature, a score is given for each of the signal samples,
  and DESeq is run on the resulting table. The name of each sample is the one given in the track
  definition line ("track name=... description=... etc."), if specified, otherwise the name of
  the file (without extension).
* A tab-delimited table with feature names in the first column, then one column of respective
  scores per sample. The first line is a header of the type "id  sample1  sample2 ...".
  If sample names are in the format 'group_name.run_id', all samples with the same group_name
  will be considered as replicates of the same group/condition. Else they are considered as belonging
  to different groups. If there are more than 2 groups, all different pairs of comparisons
  will be performed and output in separate files.

    """
    info = {
        'title': 'Differential expression analysis',
        'description': __doc__,
        'path': ['Analysis', 'DE analysis'],
        'output': DESeqForm,
        'in': in_parameters,
        'out': out_parameters,
        'meta': meta,
        }

[docs]    def clean_deseq_output(self,filename,contrast):
        """Delete all lines of *filename* with NA's everywhere, add 0.5 to zero scores
        before recalculating the fold change, remove row numbers, and keep only the following
        fields: Name, MeanA, MeanB, fold change, adjusted p-value. Return the new file name."""
        filename_clean = self.temporary_path()
        with open(filename,"rb") as f:
            with open(filename_clean,"wb") as g:
                f.readline() # header
                A = 'Mean.'+contrast[0].strip()
                B = 'Mean.'+contrast[1].strip()
                g.write('-'.join(contrast)+'\n')
                g.write('\t'.join(['Name',A,B,'foldChange','padj'])+'\n')
                for line in f:
                    line = line.strip().split("\t")
                    if not (line[2]=="0" and line[3]=="0") and not (line[2]=='NA' or line[3]=='NA'):
                        meanA = float(line[2]) or 0.5
                        meanB = float(line[3]) or 0.5
                        fold = meanB/meanA
                        line = '\t'.join([line[0],str(meanA),str(meanB),str(fold),line[7]])+'\n'
                        g.write(line)
        return filename_clean

    def __call__(self, **kw):
        if kw.get('input_type') == 'Table':
            filename = kw.get('table')
            assert os.path.exists(str(filename)), "File not found: '%s'" % filename
            colnames = numpy.asarray(open(filename).readline().split()[1:])
            robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
            robjects.r("""
            Mdata <- read.table('%s',sep='\t',header=T,row.names=1)
            conds <- unlist(strsplit(col_names,".",fixed=T))
            conds <- colnames(Mdata)
            """ % filename)
        else:
            from QuantifyTable import QuantifyTablePlugin
            assembly = genrep.Assembly(kw.get('assembly'))
            chrmeta = assembly.chrmeta or "guess"
            kw['score_op'] = 'sum'
            signals1 = kw['Group1']['signals1']
            signals2 = kw['Group2']['signals2']
            if not isinstance(signals1,(list,tuple)): signals1 = [signals1]
            if not isinstance(signals2,(list,tuple)): signals2 = [signals2]
            kw['SigMulti'] = {'signals': signals1+signals2} # to pass it to QuantifyTable plugin
            signals = kw['SigMulti']['signals']
            table = QuantifyTablePlugin().quantify(**kw)
            stracks = []
            norm_factors = []
            for sig in signals:
                assert os.path.exists(str(sig)), "Signal file not found: '%s'." % sig
                _t = track(sig, chrmeta=chrmeta)
                if 'normalization' in _t.info:
                    _nf = float(_t.info['normalization'])
                elif 'nreads' in _t.info:
                    _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
                else:
                    _nf = 1
                stracks.append(_t)
                norm_factors.append(_nf)
            t = track(table,chrmeta=chrmeta,fields=['chr','start','end','name']+ \
                                                   ['score%d'%x for x in range(len(signals))])
            _f = [f for f in t.fields if f.startswith('score')]
            de_list = list(t.read(fields=['name']+_f))
            t.close(); os.remove(table)
            # Turn all scores into integers
            de_matrix = numpy.asarray([[int(float(s) * norm_factors[k] + .5) for k,s in enumerate(x[1:])]
                                       for x in de_list], dtype=numpy.float)
            rownames = numpy.asarray([x[0] for x in de_list])
            colnames = numpy.asarray([s.info.get('name',os.path.splitext(os.path.basename(s.path))[0])
                                      for s in stracks])
             # if all prefixes are identical within a group, keep this prefix as group identifier.
            if len(list(set( [x.split('.')[0] for x in colnames[:len(signals1)]] ))) == 1 \
            and len(list(set( [x.split('.')[0] for x in colnames[len(signals1):]] ))) == 1:
                group1 = colnames[0].split('.')[0]
                group2 = colnames[-1].split('.')[0]
            else:
                group1 = "Group1"
                group2 = "Group2"
            conds = [group1]*len(signals1) + [group2]*len(signals2)
            robjects.r.assign('Mdata', numpy2ri.numpy2ri(de_matrix))
            robjects.r.assign('row_names', numpy2ri.numpy2ri(rownames))
            robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
            robjects.r.assign('conds', numpy2ri.numpy2ri(conds))
            robjects.r("""
            Mdata <- as.data.frame(Mdata,row.names=row_names)
            conds <- unlist(col_names)
            colnames(Mdata) <- conds
            """)

        robjects.r("""
        ### Still need to check that replicates are not identical - lfproc would fail
        groups <- unique(conds)
        couples <- combn(groups,2)
        if (any(table(conds)>1)){ method = 'pooled' # if replicates
        } else { method = 'blind' }
        """)

        robjects.r("""
        library(DESeq)
        if (all(table(conds)>=3)){        # if >3 replicates in all conditions
            method = 'per-condition'        # for each group estimate the variance from its replicates
            sharingMode = 'gene-est-only'   # use the per-gene variance estimates only
        } else if (any(table(conds)>1)){ # if few replicates
            method = 'pooled'               # use all groups with replicates to estimate the variance
            sharingMode = 'maximum'         # use the max of the GLM fit and the estimated variance
        } else {                         # if no replicates
            method = 'blind'                # pools all groups together to estimate the variance
            sharingMode='fit-only'          # use only the GLM fit across the pooled variance
        }
        cds <- newCountDataSet(Mdata, conds)
        cds <- estimateSizeFactors(cds)
        test = try({
            cds <- estimateDispersions(cds, method=method, fitType='parametric', sharingMode=sharingMode)
        })
        if(class(test) == "try-error") {
            cds <- estimateDispersions(cds, method=method, fitType='local', sharingMode=sharingMode)
        }
        """)

        groups = list(set(colnames))
        couples = itertools.combinations(groups, 2)
        output = self.temporary_path(fname='DE')
        for c in couples:
            out = output + '_' + c[0] + '-' + c[1] + '.txt'
            r_cmd = """
            res <- nbinomTest(cds, '%s', '%s')
            res <- res[order(res[,8]),]
            write.table(res, '%s', row.names=F, quote=F, sep='\t')
            """ % (c[0], c[1], out)
            robjects.r(r_cmd)
            if kw.get('complete') is None:
                clean = self.clean_deseq_output(out,c)
                shutil.move(clean,out)
            self.new_file(out, 'differential_expression')
        return self.display_time()
Navigation

Source code for bsPlugins.DESeq

Other BBCF projects

Quick search

Navigation