Source code for iCount.analysis.summary

""".. Line to protect from pydocstyle D205, D400.

Cross-link site summary
-----------------------

Report proportion of cross-link events/sites on each region type.
"""
import os
import re
import logging

import pybedtools
from pybedtools import create_interval_from_list

import iCount
from iCount.genomes.segment import _complement, _first_two_columns

LOGGER = logging.getLogger(__name__)


[docs]def make_types_length_file(annotation, fai, out_file=None, subtype='biotype', excluded_types=None):
    """
    Calculate the number of non-overlapping base pairs of each "type".

    In the context of this function "type" equals to the combination of 3rd
    column and attribute subtype from annotation file (GTF).


    Parameters
    ----------
    annotation : str
        Path to annotation GTF file (should include subtype attribute).
    out_file : str
        Path to output file (if None, name is set automatically).
    subtype : int
        Subtype.
    excluded_types : list_str
        Types listed in 3rd column of GTF to be exclude from analysis.

    Returns
    -------
    str
        Absolute path to out_file.

    """
    excluded_types = excluded_types or []
    ann_filtered = pybedtools.BedTool(annotation).filter(
        lambda x: x[2] not in excluded_types).sort().saveas()

    fai = _first_two_columns(fai)

    if out_file is None:
        match = re.match(r'([\w_]+\.\d+.).*', os.path.basename(annotation))
        if match:
            out_file = str(match.group(1)) + 'types_length.txt'
        else:
            out_file = annotation + 'types_length.txt'

    # Merge all annotation and take what is left = unannotated regions:
    unann_pos = _complement(ann_filtered.fn, fai, '+', type_name='unannotated')
    unann_neg = _complement(ann_filtered.fn, fai, '-', type_name='unannotated')
    ann_modified_file = out_file + 'modified'
    ann_joined = ann_filtered.cat(unann_pos, unann_neg, postmerge=False). \
        sort().saveas(ann_modified_file)

    data = {}
    for interval in ann_joined:
        if subtype:
            sbtyp = interval.attrs.get(subtype, None)
            type_ = ' '.join([interval[2], sbtyp] if sbtyp else [interval[2]])
        else:
            type_ = interval[2]
        if type_ not in data:
            data[type_] = []
        data[type_].append(create_interval_from_list(interval.fields))

    type_lengths = {}
    for type_, list_ in data.items():
        total_bp = 0
        # pylint: disable=unexpected-keyword-arg
        for feature in pybedtools.BedTool(i for i in list_).merge(s=True):
            total_bp += len(feature)
        type_lengths[type_] = total_bp

    # Write results to file:
    with open(out_file, 'wt') as outfile:
        for type_, length in sorted(type_lengths.items()):
            outfile.write('{}\t{}\n'.format(type_, length))

    return os.path.abspath(out_file), os.path.abspath(ann_modified_file)


[docs]def make_summary_report(annotation, sites, summary, fai, types_length_file=None, digits='8',
                        subtype=None, excluded_types=None):
    """
    Make summary report from cross-link and annotation data.

    In the context of this report "type" equals to the combination of 3rd
    column and attribute `subtype` from annotation file (GTF).
    "Regions" are parts of transcript (UTR, CDS, introns...) that "non-
    intersectingly" span the whole transcript. Intergenic regions are also
    considered as region. Each region has one and only one type.

    For each of such types, number of cross-link events/sites is counted.
    Sites = sum of *all* cross-link sites that are in regions of some type.
    Events = sum of col5 for *all* cross-link sites that are in regions of
    some type (multiple events can happen on each cross link). Col5 is
    numerical value in 5th column of cross-link file.

    Parameters
    ----------
    annotation : str
        Path to annotation GTF file (should include subtype attribute).
    sites : str
        Path to BED6 file listing cross-linked sites.
    summary : str
        Path to output tab-delimited file with summary statistics.
    fai : str
        Path to file with chromosome lengths.
    types_length_file : str
        Path to file with lengths of each type.
    digits : int
        Number of decimal places in results.
    subtype : str
        Name of attribute to be used as subtype.
    excluded_types : list_str
        Types listed in 3rd column of GTF to be exclude from analysis.

    Returns
    -------
    str
        Path to summary report file (equal to parameter out_file)

    """
    iCount.log_inputs(LOGGER, level=logging.INFO)
    metrics = iCount.Metrics()

    # If not given/present, make file with cumulative length for each type:
    if not types_length_file or not os.path.isfile(types_length_file):
        LOGGER.info('types_length_file not given - calculating it')
        types_length_file, annotation = make_types_length_file(
            annotation, fai, subtype=subtype, excluded_types=excluded_types)
    # read the file to dict, named type_lengths
    type_lengths = {}
    with open(types_length_file) as tfile:
        for line in tfile:
            parts = line.strip().split()
            type_lengths[' '.join(parts[:-1])] = int(parts[-1])

    # sorted=True - invokes memory efficient algorithm for large files
    # s=True - only report hits in B that overlap A on the same strand
    # wb=True - Write the original entry in B for each overlap
    cross_links = pybedtools.BedTool(sites).sort().saveas()
    annotation = pybedtools.BedTool(annotation).sort().saveas()
    LOGGER.info('Calculating intersection between cross-link and annotation...')
    overlaps = cross_links.intersect(annotation, sorted=True, s=True, wb=True).saveas()
    try:
        # this will raise TypeError if overlaps is empty:
        overlaps[0]
    except (IndexError, TypeError):
        raise ValueError('No intersections found. This may be caused by '
                         'different naming of chromosomes in annotation and '
                         'cross-links file (example: "chr1" vs. "1")')

    # dict structure = type_: [# of sites, # of events]
    type_counter = {type_: [0, 0] for type_ in type_lengths}
    site_types = []
    previous_segment = overlaps[0]

    def finalize(types, segment):
        """Increase counter for all types that intersect with segment."""
        for type_ in set(types):
            type_counter[type_][0] += 1  # sites
            type_counter[type_][1] += int(segment[4])  # events

    LOGGER.info('Extracting summary from data...')
    for segment in overlaps:
        # detect if segment contains new cross-link site:
        if segment.start != previous_segment.start or segment.strand != previous_segment.strand:
            finalize(site_types, previous_segment)
            site_types = []
        if subtype:
            # Extract subtype attribute:
            stype = re.match(r'.*{} "(.*)";'.format(subtype), segment[-1])
            stype = stype.group(1) if stype else None
            site_types.append(' '.join([segment[8], stype] if stype else [segment[8]]))
        else:
            site_types.append(segment[8])
        previous_segment = segment
    finalize(site_types, previous_segment)

    # Produce report file:
    header = ['type', 'length', 'length %', 'sites #', 'sites %',
              'sites enrichment', 'events #', 'events %', 'events enrichment']
    sum_sites = sum([i[0] for i in type_counter.values()])
    sum_events = sum([i[1] for i in type_counter.values()])

    # total genome len = sum_of_all_chrom_length * 2 (there are + and - strand):
    total_length = sum([int(line.strip().split()[1]) for line in
                        open(fai)]) * 2
    with open(summary, 'wt') as out:
        out.write('\t'.join(header) + '\n')
        for type_, [sites, events] in sorted(type_counter.items()):
            length_percent = type_lengths[type_] / total_length
            site_percent = sites / sum_sites
            site_enrichment = site_percent / length_percent
            event_percent = events / sum_events
            event_enrichment = event_percent / length_percent
            line = [type_, type_lengths[type_], length_percent,
                    sites, site_percent, site_enrichment,
                    events, event_percent, event_enrichment]
            line = line[:1] + [round(i, int(digits)) for i in line[1:]]
            out.write('\t'.join(map(str, line)) + '\n')

    LOGGER.info('Done. Results saved to: %s', os.path.abspath(summary))
    return metrics