Source code for iCount.analysis.annotate

""".. Line to protect from pydocstyle D205, D400.

Cross-link site annotation
--------------------------

Annotate each cross link site with types of regions that intersect with it.

"""
import re
import os
import logging

import pybedtools
from pybedtools import create_interval_from_list

import iCount

LOGGER = logging.getLogger(__name__)


[docs]def annotate_cross_links(annotation, sites, sites_annotated, subtype='biotype',
                         excluded_types=None):
    """
    Annotate each cross-link site with all region types that intersect it.

    Each cross link can overlap/intersect with many intervals in annotation
    file. Each of these intervals has a given type. Make a copy of cross
    link file and include all these types in 4th column.

    In the context of this report "type" equals to the combination of 3rd column
    and attribute `subtype` from annotation file (GTF). Regions/intervals are
    parts of transcript (UTR, CDS, introns...) that "non- intersectingly" span
    the whole transcript. However, since transcripts can overlap, also intervals
    belonging to different transcripts can overlap. Intergenic regions are also
    considered as region. Each region has one and only one type.

    Parameters
    ----------
    annotation : str
        Path to annotation file (should be GTF and include `subtype` attribute).
    sites : str
        Path to input BED6 file listing all cross-linked sites.
    sites_annotated : str
        Path to output BED6 file listing annotated cross-linked sites.
    subtype : str
        Subtype.
    excluded_types : list_str
        Excluded types.

    Returns
    -------
    str
        Path to summary report file (should be equal to out_file parameter)

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    excluded_types = excluded_types or []
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).filter(
        lambda x: x[2] not in excluded_types).sort().saveas()

    LOGGER.info('Calculating overlaps between cross-link and annotation_file...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True, wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError):
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and'
                         'cross_links file ("chr1" vs. "1")')

    data = []  # cotainer for final annotated BED file intervals
    site_types = []  # cotainer for all types intersecting with given cross-link
    previous_interval = overlaps[0]

    def finalize(types, site):
        """Make annotated (with all intersecting types) cross link interval."""
        data.append(create_interval_from_list(
            site[0:3] + ['; '.join(map(str, sorted(set(types))))] + site[4:6]))

    for interval in overlaps:
        # Detect new cross link:
        if interval.start != previous_interval.start or \
           interval.strand != previous_interval.strand:
            finalize(site_types, previous_interval)
            site_types = []
        if subtype:
            # Extract subtype attribute:
            stype = re.match(r'.*{} "(.*?)";'.format(subtype), interval[-1])
            site_types.append('{} {}'.format(interval[8], stype.group(1) if stype else '.'))
        else:
            site_types.append(interval[8])
        previous_interval = interval
    finalize(site_types, previous_interval)

    # Produce annotated cross-link file:
    LOGGER.info('Writing results to file...')
    # To save with .gz compression, file has to first be saved to tmp_dir and
    # saved to filename with .gz extension later. Otherwise, file has gzip
    # extension, but it is not actually gzipped.
    tmp_ann = pybedtools.BedTool(line for line in data).saveas()
    tmp_ann.saveas(sites_annotated)

    LOGGER.info('Done. Output saved to: %s', os.path.abspath(sites_annotated))
    return metrics
iCount

Navigation

Related Topics

Source code for iCount.analysis.annotate