Source code for iCount.files.bed
""".. Line to protect from pydocstyle D205, D400.
BED
---
Reading and writing `BED`_ files.
"""
import os
import shutil
import logging
import tempfile
import pybedtools
import iCount
# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)
def _convert_legacy_bed_format(feature):
    """
    Convert a single interval from legacy iCount format into BED6.

    Old iCount legacy format:

        chrom, start, end, [+-]score

    where +/- indicates the strand of the cross-link site and score
    indicates the intensity of interaction. The signed score is read from
    ``feature.name``; when it carries no leading sign, the '+' strand is
    used and the whole value is taken as the score.

    The returned interval is built from the list
    [chrom, start, end, '.', score, strand], i.e. BED6 layout, see:
    http://bedtools.readthedocs.io/en/latest/content/general-usage.html
    """
    value = feature.name
    sign = value[0]
    if sign in ('-', '+'):
        # Leading character is the strand; the remainder is the score.
        strand, score = sign, value[1:]
    else:
        # Unsigned value: fall back to the '+' strand.
        strand, score = '+', value

    fields = [feature.chrom, feature.start, feature.stop, '.', score, strand]
    return pybedtools.create_interval_from_list(fields)
def convert_legacy(bedgraph_legacy, bed_converted):
    """
    Convert legacy iCount's four-column format into proper BED6 format.

    Old iCount legacy format: chrom, start, end, [+-]value

    Strand can be either '+' or '-', and value indicates the intensity of
    interaction.

    The input is sorted first, then every interval is converted with
    ``_convert_legacy_bed_format`` and the result is saved to
    ``bed_converted``. The saved file follows the BED6 format, as explained
    in the bedtools manual:
    http://bedtools.readthedocs.io/en/latest/content/general-usage.html

    Parameters
    ----------
    bedgraph_legacy
        Input in legacy format, handed to ``pybedtools.BedTool``
        (presumably a file path - confirm against callers).
    bed_converted
        Destination handed to ``saveas`` for the converted BED6 output.

    Returns
    -------
    The object returned by ``saveas(bed_converted)``.
    """
    sorted_sites = pybedtools.BedTool(bedgraph_legacy).sort().saveas()
    return sorted_sites.each(_convert_legacy_bed_format).saveas(bed_converted)
def merge_bed(sites_grouped, sites):
    """
    Merge multiple files with crosslinks into one.

    Concatenate files into one file. Also, merge crosslinks from different
    files that are on same position and sum their scores.

    Parameters
    ----------
    sites_grouped : str
        Path to output BED6 file containing merged data from input sites files.
    sites : list_str
        List of BED6 files(paths) to be merged.

    Returns
    -------
    str
        Absolute path to outfile.

    Raises
    ------
    ValueError
        If ``sites`` is empty or if any path in ``sites`` is not an existing
        file.
    """
    iCount.log_inputs(LOGGER, level=logging.INFO)

    if not sites:
        raise ValueError(
            "At least one element expected in files list, but none found.")

    # Check every input before creating any temporary file, so that a missing
    # input does not leave a half-written file behind.
    for file_path in sites:
        if not os.path.isfile(file_path):
            raise ValueError("File {} not found.".format(file_path))

    LOGGER.info('Reading input files...')
    # delete=False: the file has to outlive the handle, because it is re-read
    # by path below. It is removed explicitly in the ``finally`` clause.
    joined = tempfile.NamedTemporaryFile(mode='at', delete=False)
    try:
        with joined:
            for file_path in sites:
                with iCount.files.gz_open(file_path, 'rt') as infile:
                    shutil.copyfileobj(infile, joined)

        # Merge intervals in "joined" file (needs to be sorted before!):
        # s=True - only merge features that are on the same strand
        # d=-1 - join only intervals with at least one base-pair overlap -
        #        default (0) merges also touching intervals
        # c=(5, 6), o=('sum', 'distinct') - when merging intervals, sum
        #        column 5 (score) and report distinct values of column 6
        #        (strand)
        LOGGER.info('Merging files...')
        merged = pybedtools.BedTool(joined.name).sort().merge(
            s=True, d=-1, c=(5, 6), o=('sum', 'distinct')).sort().saveas()
    finally:
        # The concatenated input is no longer needed (or was never completed).
        os.remove(joined.name)

    # Merged rows hold chrom-start-stop followed by the columns requested
    # with ``c`` above (presumably score, then strand - depends on the
    # installed bedtools version, verify).
    # Insert an empty name ('.') after the first three columns to get:
    # chrom-start-stop-empty_name-score-strand, which corresponds to BED6.
    LOGGER.info('Saving results...')
    # Keep the BedTool returned by saveas(sites_grouped): it is the one that
    # points at the requested output file. An argument-less saveas() would
    # point at a pybedtools temporary file instead.
    result = pybedtools.BedTool(
        pybedtools.create_interval_from_list(
            interval[:3] + ['.'] + interval[3:])
        for interval in merged
    ).saveas(sites_grouped)

    outfile = os.path.abspath(result.fn)
    LOGGER.info('Done. Results saved to: %s', outfile)
    return outfile