# Source code for bsPlugins.Intersections

from bsPlugins import *
from itertools import combinations
from bbcflib.gfminer.figure import venn
import os, tarfile


# Input form of the Intersections plugin: a multi-file selector, a text field
# for the column index, and a submit button.
class IntersectionsForm(BaseForm):

    # Repeatable group of file inputs; referenced by name ('SigMulti') in
    # `in_parameters` and read back as kw['SigMulti']['files'] by the plugin.
    class SigMulti(twb.BsMultiple):
        label = 'Files: '
        files = twb.BsFileField(
            label=' ',
            help_text='Select signal files (e.g. bedgraph)',
            validator=twb.BsFileFieldValidator(required=True),
        )

    # Column holding the elements to intersect (1-based, shown pre-filled with 1).
    column = twf.TextField(
        label='Column(s): ',
        prompt_text='1',
        value=1,
        help_text='Column(s) number (1-based).',
    )

    submit = twf.SubmitButton(id="submit", value="Submit")


# Plugin metadata, exposed through IntersectionsPlugin.info['meta'].
meta = {
    'version': "1.0.0",
    'author': "BBCF",
    'contact': "webmaster-bbcf@epfl.ch",
}

# Inputs: the files to intersect (provided through the 'SigMulti' form group)
# and the index of the column to consider.
in_parameters = [
    {'id': 'files', 'type': 'track', 'multiple': 'SigMulti', 'required': True},
    {'id': 'column', 'type': 'text'},
]

# Outputs: the intersections and a Venn diagram file.
out_parameters = [
    {'id': 'intersections', 'type': 'track'},
    {'id': 'venn_diagram', 'type': 'file'},
]


# NOTE: the class docstring below is user-facing text: it is reused verbatim
# as info['description'] (via `__doc__` in the class body), so its wording is
# kept unchanged. Developer-oriented notes belong in comments or in the method
# docstrings instead.
class IntersectionsPlugin(BasePlugin):
    """Returns the elements that are common to a set of text files, for instance the list of genes
    common to several lists of genes or annotation files. In the case when more that two files are given,
    all possible combinations of intersections are performed (2-by-2, 3-by-3, etc.),
    in the manner of a Venn diagram.

    If the elements to intersect are not in the first column, one can specify the column to consider
    by its index (first column is 1).

    Since the number of comparisons is approximately 2^(number of files), it is unadvised to compare
    more that a dozen of files (10 input files -> 2^10-11=1013 comparisons).

    The output is a compressed folder containing a summary file and a sub-folder with all the possible
    intersections, i.e. for each intersection one text file with the list of common elements.
    """
    info = {
        'title': 'Intersections',
        'description': __doc__,
        'path': ['Analysis', 'Intersections of lists'],
        'output': IntersectionsForm,
        'in': in_parameters,
        'out': out_parameters,
        'meta': meta,
    }

    def intersect(self, files_list, idx=0):
        """Return the set of elements found in column `idx` of every file.

        :param files_list: paths of the whitespace-delimited text files.
        :param idx: 0-based index of the column to read in each file.
        :returns: set of strings present in that column of all the files
            (an empty set if `files_list` is empty).
        :raises IndexError: if a non-blank line has no column `idx`.
        """
        common = set()
        for n, f in enumerate(files_list):
            col = set()
            # `with` guarantees the handle is closed, even if a line is malformed.
            with open(f) as handle:
                for line in handle:
                    fields = line.split()
                    if not fields:
                        # Blank lines (e.g. a trailing newline) carry no element.
                        continue
                    col.add(fields[idx])
            if n == 0:
                common = col
            else:
                common &= col
        return common

    def compare(self, files_list, output, idx=0):
        """Intersect every combination of 2 or more files and write the results.

        Creates, inside the directory `output` (created if missing):

        * ``summary.txt``: a legend (index -> file), the number of lines of
          each file, then the size of every intersection;
        * one ``<k>-by-<k>/`` sub-directory per combination size k, holding
          one ``<i>|<j>|....txt`` file per combination with the common
          elements, one per line.

        :param files_list: paths of the files to compare.
        :param output: path of the directory to write to.
        :param idx: 0-based index of the column to compare.
        :returns: tuple ``(counts, legend)``; `counts` maps ``"i"`` to the
            number of lines of file i and ``"i|j|..."`` to the size of the
            corresponding intersection, `legend` maps each index to its file.
        """
        if not os.path.exists(output):
            os.mkdir(output)
        counts = {}
        legend = {}
        nfiles = len(files_list)
        # Text mode: only text is written, and 'wb' rejects str on Python 3.
        with open(os.path.join(output, "summary.txt"), 'w') as summary:
            summary.write("# Legend:\n")
            for i, f in enumerate(files_list):
                summary.write("%d\t%s\n" % (i, f))
                legend[i] = f
            summary.write("\n### Files\tnb_elements\n")
            summary.write("\n# Self\n\n")
            for i, f in enumerate(files_list):
                # NOTE(review): this is the raw line count, not the number of
                # distinct elements in column `idx`; the two differ when a file
                # has duplicates or blank lines. Kept as in the original.
                with open(f) as handle:
                    nlines = sum(1 for _ in handle)
                summary.write("%d\t%d\n" % (i, nlines))
                counts[str(i)] = nlines
            for k in range(2, nfiles + 1):
                summary.write("\n# %d-by-%d\n\n" % (k, k))
                path = os.path.join(output, "%s-by-%s/" % (k, k))
                if not os.path.exists(path):
                    os.mkdir(path)
                for cb in combinations(range(nfiles), k):
                    names = sorted([str(x) for x in cb])
                    name = "|".join(names)
                    # Compute first, so that a failure does not leave an open,
                    # empty result file behind.
                    common = self.intersect([files_list[i] for i in cb], idx)
                    with open(os.path.join(path, "%s.txt" % name), 'w') as out:
                        for x in common:
                            out.write(x + '\n')
                    summary.write("%s\t%s\n" % (name, len(common)))
                    counts[name] = len(common)
        return counts, legend

    def __call__(self, **kw):
        """Run the plugin: compare the files, archive the results, draw a Venn diagram.

        Reads the file list from ``kw['SigMulti']['files']`` and the 1-based
        column number from ``kw['column']`` (defaults to 1 when missing or
        empty). The Venn diagram is only produced for 4 files or fewer.

        :returns: the value of ``self.display_time()``.
        """
        files_list = kw['SigMulti']['files']
        # 'column' is an optional parameter: fall back on the first column
        # instead of failing in int() on a missing/empty value.
        column = int(kw.get('column') or 1) - 1
        output = self.temporary_path(fname='intersections.')
        counts, legend = self.compare(files_list, output, column)
        # compress
        output_targz = self.temporary_path(fname=output+'tar.gz')
        tar = tarfile.open(output_targz, 'w:gz')
        try:
            tar.add(output)
        finally:
            tar.close()
        # Register the archive that was actually written; the original
        # registered output+'.tar.gz', a different path that is never created.
        self.new_file(output_targz, 'intersections')
        if len(files_list) <= 4:
            # Venn diagram
            venn_format = 'png'
            # 'venn.' + format, so that the file gets a proper extension.
            venn_outname = self.temporary_path(fname='venn.'+venn_format)
            # NOTE(review): `legend` is computed by compare() but not passed
            # on here - confirm against venn()'s expected legend format.
            venn(counts, legend=None, options={}, output=venn_outname, format=venn_format)
            self.new_file(venn_outname, 'venn_diagram')
        return self.display_time()

# Other BBCF projects