Source code for iCount.analysis.summary

""".. Line to protect from pydocstyle D205, D400.

Cross-link site summary
-----------------------

Report count of cross-link events in each region type.
"""
import logging
import math
import re
import os
import tempfile

from pybedtools import BedTool

import iCount
from iCount.genomes.region import (
    summary_templates, sort_types_subtypes, TEMPLATE_TYPE, TEMPLATE_SUBTYPE, TEMPLATE_GENE, SUMMARY_TYPE,
    SUMMARY_SUBTYPE, SUMMARY_GENE
)

LOGGER = logging.getLogger(__name__)


[docs]def summary_reports(annotation, sites, out_dir, templates_dir=None): """ Make summary reports for a cross-link file. Parameters ---------- annotation : str Annotation file (GTF format). It is recommended to use genome-level segmentation (e.g. regions.gtf.gz), that is produced by ``iCount segment`` command. sites : str Croslinks file (BED6 format). Should be sorted by coordinate. out_dir : str Output directory. templates_dir : str Directory containing templates for summary calculation. Made by ``iCount segment`` command. If this argument is not provided, summary templates are made on the fly. Returns ------- iCount.Metrics iCount Metrics object. """ iCount.log_inputs(LOGGER, level=logging.INFO) metrics = iCount.Metrics() if templates_dir is None: templates_dir = tempfile.mkdtemp() summary_templates(annotation, templates_dir) LOGGER.info('Calculating intersection between cross-link and annotation...') # pylint: disable=too-many-function-args,unexpected-keyword-arg overlaps = BedTool(sites).intersect( BedTool(annotation), sorted=True, # invokes memory efficient algorithm for large files s=True, # only report hits in B that overlap A on the same strand wb=True, # write the original entry in B for each overlap nonamecheck=True, # Do not print warnings about name inconsistency to stdout ).saveas() # pylint: enable=too-many-function-args,unexpected-keyword-arg try: overlaps[0] # will raise Error if overlaps is empty: except (IndexError, TypeError): raise ValueError('No intersections found. This may be caused by different naming of chromosomes in annotation' 'and cross-links file (example: "chr1" vs. "1")') type_counter, subtype_counter, gene_counter = {}, {}, {} LOGGER.info('Extracting summary data from intersection...') for segment in overlaps: score = int(segment.score) type_ = segment[8] type_counter[type_] = type_counter.get(type_, 0) + score biotype = re.match(r'.*biotype "(.*?)";', segment[-1]) biotype = biotype.group(1) if biotype else '' biotypes = biotype.split(',') for biotype in biotypes: sbtyp = iCount.genomes.region.make_subtype(type_, biotype) subtype_counter[sbtyp] = subtype_counter.get(sbtyp, 0) + score / len(biotypes) gene_id = re.match(r'.*gene_id "(.*?)";', segment[-1]) gene_id = gene_id.group(1) if gene_id else None gene_counter[gene_id] = gene_counter.get(gene_id, 0) + score sum_cdna = 0 for seg in BedTool(sites): sum_cdna += int(seg.score) def parse_template(template_file): """Parse template file.""" template = {} with open(template_file, 'rt') as ifile: for line in ifile: line = line.strip().split('\t') template[line[0]] = line[1:] return template LOGGER.info('Writing type report...') type_template = parse_template(os.path.join(templates_dir, TEMPLATE_TYPE)) with open(os.path.join(out_dir, SUMMARY_TYPE), 'wt') as out: header = ['Type', 'Length', 'cDNA #', 'cDNA %'] out.write('\t'.join(header) + '\n') for type_, cdna in sorted(type_counter.items(), key=lambda x: sort_types_subtypes(x[0])): line = [type_, type_template.get(type_, [-1])[0], math.floor(cdna), cdna / sum_cdna * 100] out.write('\t'.join(map(str, line)) + '\n') LOGGER.info('Writing subtype report...') subtype_template = parse_template(os.path.join(templates_dir, TEMPLATE_SUBTYPE)) with open(os.path.join(out_dir, SUMMARY_SUBTYPE), 'wt') as out: header = ['Subtype', 'Length', 'cDNA #', 'cDNA %'] out.write('\t'.join(header) + '\n') for stype, cdna in sorted(subtype_counter.items(), key=lambda x: sort_types_subtypes(x[0])): line = [stype, subtype_template.get(stype, [-1])[0], math.floor(cdna), cdna / sum_cdna * 100] out.write('\t'.join(map(str, line)) + '\n') LOGGER.info('Writing gene report...') gene_template = parse_template(os.path.join(templates_dir, TEMPLATE_GENE)) with open(os.path.join(out_dir, SUMMARY_GENE), 'wt') as out: header = ['Gene name (Gene ID)', 'Length', 'cDNA #', 'cDNA %'] out.write('\t'.join(header) + '\n') for gene_id, cdna in sorted(gene_counter.items()): gene_name, length = gene_template.get(gene_id, ['', -1]) if gene_id == '.': gene_name = 'intergenic' line = ['{} ({})'.format(gene_name, gene_id), length, math.floor(cdna), cdna / sum_cdna * 100] out.write('\t'.join(map(str, line)) + '\n') LOGGER.info('Done.') return metrics