Source code for iCount.analysis.rnamaps

""".. Line to protect from pydocstyle D205, D400.

RNA maps
--------

Perform RNA-maps analysis.
"""
import csv
import logging
import os

import pandas as pd
from pybedtools import BedTool

from iCount.genomes.constants import RNAMAP_TYPES
import iCount  # pylint: disable=wrong-import-position

LOGGER = logging.getLogger(__name__)


[docs]def get_single_type_landmarks(landmarks, maptype): """Get file with landmarks of only certain type.""" single_type_landmarks = BedTool(landmarks).filter(lambda x: x.name.split(';')[0] == maptype).saveas() return single_type_landmarks.fn
[docs]def compute_distances(landmarks, sites, maptype): """Compute distances between each xlink and it's closest landmark.""" # pylint: disable=too-many-function-args,unexpected-keyword-arg closest = BedTool(sites).closest( # pylint: disable=assignment-from-no-return landmarks, s=True, t='first', D='a', nonamecheck=True, ) # pylint: enable=too-many-function-args,unexpected-keyword-arg distances = {} total_cdna = 0 for mseg in closest: # "mseg" means merged segment, since it consists of 3 parts: # sites segment (BED6) + landmark segment (BED6) + distance if len(mseg.fields) != 13: LOGGER.warning('Segment length shoud be 13, not %s. Segment fields: %s', str(len(mseg)), str(mseg.fields)) score = int(mseg[4]) chrom = mseg[6] pos = mseg[7] gene_name = mseg[9] strand = mseg[11] distance = -int(mseg[-1]) total_cdna += score mapdata = RNAMAP_TYPES[maptype] if not -mapdata['upstream-size-limit'] <= distance <= mapdata['downstream-size-limit']: continue if chrom == '.' or strand not in ['+', '-'] or '-' in pos: continue # loc = Landmark "ID" = landmark exact coordinates loc = '{}__{}__{}__{}'.format(chrom, strand, pos, gene_name) distances.setdefault(loc, {})[distance] = distances.get(loc, {}).get(distance, 0) + score return distances, total_cdna
[docs]def make_results_raw_file(distances, fname, total_cdna, maptype): """Write distances data to file.""" up_limit = -RNAMAP_TYPES[maptype]['upstream-size-limit'] down_limit = RNAMAP_TYPES[maptype]['downstream-size-limit'] header = list(range(up_limit, down_limit + 1)) with open(fname, 'wt') as handle: outfile = csv.writer(handle, delimiter='\t') outfile.writerow(['total_cdna:{}'.format(total_cdna)]) outfile.writerow(['.'] + header) for loc, positions in sorted(distances.items()): outfile.writerow([loc] + [positions.get(pos, 0) for pos in header])
[docs]def make_results_summarised_file(outdir, fname): """Write "plot data" to file.""" data = {} up_limit = -max([mtyp['upstream-size-limit'] for mtyp in RNAMAP_TYPES.values()]) down_limit = max([mtyp['downstream-size-limit'] for mtyp in RNAMAP_TYPES.values()]) header = list(range(up_limit, down_limit + 1)) for basename in [file_ for file_ in os.listdir(outdir) if file_.endswith('.tsv')]: results_file = os.path.join(outdir, basename) maptype = iCount.plotting.rnamap.guess_maptype(results_file) plot_data, _ = iCount.plotting.rnamap.parse_results(results_file) # Make date of the same size, impute with 0 score on locations with no data. data[maptype] = [plot_data.get(pos, 0) for pos in header] assert len(header) == len(data[maptype]) dframe = pd.DataFrame.from_dict(data, orient='index', columns=header) dframe.to_csv(fname, sep='\t')
[docs]def run(sites, landmarks, outdir=None, plot_type='combined', top_n=100, smoothing=1, nbins=50, binsize=None, colormap='Greys', imgfmt='png', ): """ Compute distribution of cross-links relative to genomic landmarks. Parameters ---------- sites : str Croslinks file (BED6 format). Should be sorted by coordinate. landmarks : str Landmark file (landmarks.bed.gz) that is produced by ``iCount segment``. outdir : str Output directory. plot_type : str What kind of plot to make. Choices are distribution, heatmaps and combined. top_n : int Plot heatmap for top_n best covered landmarks. smoothing : int Smoothing half-window. Average smoothing is used. nbins : int Number of bins. Either nbins or binsize can be defined, but not both. binsize : int Bin size. Either nbins or binsize can be defined, but not both. colormap : str Colormap to use. Any matplotlib colormap can be used. imgfmt : str Output image format. Returns ------- None """ iCount.logger.log_inputs(LOGGER) assert plot_type in ['distribution', 'heatmap', 'combined'] sites_name = iCount.files.remove_extension(sites, ['.bed', '.bed.gz']) if outdir is None: outdir = os.path.join(os.path.abspath(os.getcwd()), 'rnamaps_{}'.format(sites_name)) LOGGER.info('Output directory not given, creating one at %s', outdir) os.makedirs(outdir, exist_ok=True) for maptype, mapdata in RNAMAP_TYPES.items(): LOGGER.info('Creating landmarks for %s', maptype) landmarks_single_type = get_single_type_landmarks(landmarks, maptype) if not BedTool(landmarks_single_type).head(n=1, as_string=True): LOGGER.info('No landmarks for %s', maptype) continue LOGGER.info('Processing data for %s', maptype) distances, total_cdna = compute_distances(landmarks_single_type, sites, maptype) if not distances: LOGGER.warning('No distances for %s', maptype) continue LOGGER.info('Writing results to file for %s', maptype) # File with full set of results: results_raw_file = os.path.join(outdir, '{}_{}.tsv'.format(sites_name, maptype)) make_results_raw_file(distances, results_raw_file, total_cdna, maptype) kwargs = { 'fname': results_raw_file, 'outfile': os.path.join(outdir, '{}_{}.{}'.format(sites_name, maptype, imgfmt)), 'up_limit': mapdata.get('upstream-plot-width', mapdata['upstream-size-limit']), 'down_limit': mapdata.get('downstream-plot-width', mapdata['downstream-size-limit']), 'top_n': top_n, 'smoothing': smoothing, 'nbins': nbins, 'binsize': binsize, 'colormap': colormap, } if plot_type == 'distribution': kwargs['fnames'] = kwargs['fname'] for key in ['top_n', 'nbins', 'binsize', 'colormap', 'fname']: del kwargs[key] iCount.plotting.rnamap.plot_rnamap(**kwargs) # pylint: disable=unexpected-keyword-arg elif plot_type == 'heatmap': del kwargs['smoothing'] iCount.plotting.rnaheatmap.plot_rnaheatmap(**kwargs) # pylint: disable=unexpected-keyword-arg elif plot_type == 'combined': iCount.plotting.rnacombined.plot_combined(**kwargs) # Single file with only RNA-maps distibution plot data. results_summarised_file = os.path.join(outdir, '{}_plot_data.tsv'.format(sites_name)) make_results_summarised_file(outdir, results_summarised_file) LOGGER.info('Done.')