Source code for binning

"""
A Python implementation of the interval binning scheme.

These are some utility functions for working with the interval binning scheme
as used in the `UCSC Genome Browser
<http://genome.cshlp.org/content/12/6/996.full>`_. This scheme can be used to
implement fast overlap-based querying of intervals, essentially mimicking an
`R-tree <https://en.wikipedia.org/wiki/R-tree>`_ index.

Note that some database systems natively support spatial index methods such as
R-trees. See for example the `PostGIS <http://postgis.net/>`_ extension for
PostgreSQL.

Although in principle the method can be used for binning any kind of
intervals, be aware that the largest position supported by this implementation
is 2^29 (which covers the longest human chromosome).

All positions and ranges in this module are zero-based and open-ended,
following standard Python indexing and slicing.

.. moduleauthor:: Martijn Vermaat <martijn@vermaat.name>

.. Licensed under the MIT license, see the LICENSE.rst file.
"""


# TODO: Implement the extended binning scheme (for positions > 2^29).
# TODO: Be more flexible in the binning scheme to use.


from __future__ import unicode_literals

from itertools import dropwhile


__version_info__ = ('1', '0', '1', 'dev')
__date__ = '28 Oct 2015'


__version__ = '.'.join(__version_info__)
__author__ = 'Martijn Vermaat'
__contact__ = 'martijn@vermaat.name'
__homepage__ = 'https://github.com/martijnvermaat/binning'


# Standard scheme used by the UCSC Genome Browser.
BIN_OFFSETS = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]
SHIFT_FIRST = 17
SHIFT_NEXT = 3
MAX_POSITION = pow(2, 29) - 1
MAX_BIN = BIN_OFFSETS[0] + (MAX_POSITION >> SHIFT_FIRST)


class OutOfRangeError(Exception):
    """
    Exception that is raised on seeing an invalid bin number or a position or
    interval exceeding the range of the binning scheme.
    """
    pass


def range_per_level(start, stop):
    """
    Given an interval `start:stop`, make an iterator that returns for each
    level the first and last bin overlapping the interval, starting with the
    smallest bins.

    Zero-length intervals are binned according to the one-position interval
    following it.

    Algorithm by `Jim Kent
    <http://genomewiki.ucsc.edu/index.php/Bin_indexing_system>`_.

    :arg int start, stop: Interval positions (zero-based, open-ended).

    :return: Iterator yielding tuples `first, last` being the first and last
      bin overlapping with `start:stop`. The tuples are ordered according to
      the bin size of the levels, starting with the smallest bins.
    :rtype: iterator(tuple(int, int))

    :raise OutOfRangeError: If `start:stop` exceeds the range of the binning
      scheme.
    """
    if start < 0 or stop > MAX_POSITION + 1:
        raise OutOfRangeError(
            'Interval %d:%d is out of range (maximum position is %d)'
            % (start, stop, MAX_POSITION))

    # Note that we treat the zero-length interval `x:x` as `x:x+1`.
    start_bin = start
    stop_bin = max(start, stop - 1)

    start_bin >>= SHIFT_FIRST
    stop_bin >>= SHIFT_FIRST

    for offset in BIN_OFFSETS:
        yield offset + start_bin, offset + stop_bin
        start_bin >>= SHIFT_NEXT
        stop_bin >>= SHIFT_NEXT


[docs]def assign_bin(start, stop): """ Given an interval `start:stop`, return the smallest bin in which it fits. :arg int start, stop: Interval positions (zero-based, open-ended). :return: Smallest bin containing `start:stop`. :rtype: int :raise OutOfRangeError: If `start:stop` exceeds the range of the binning scheme. """ try: return next(dropwhile(lambda r: r[0] != r[1], range_per_level(start, stop)))[0] except StopIteration: raise Exception('An unexpected error occured in assigning a bin')
[docs]def overlapping_bins(start, stop=None): """ Given an interval `start:stop`, return bins for intervals *overlapping* `start:stop` by at least one position. The order is according to the bin level (starting with the smallest bins), and within a level according to the bin number (ascending). :arg int start, stop: Interval positions (zero-based, open-ended). If `stop` is not provided, the interval is assumed to be of length 1 (equivalent to `stop = start + 1`). :return: All bins for intervals overlapping `start:stop`, ordered first according to bin level (ascending) and then according to bin number (ascending). :rtype: list(int) :raise OutOfRangeError: If `start:stop` exceeds the range of the binning scheme. """ if stop is None: stop = start + 1 return [bin for first, last in range_per_level(start, stop) for bin in range(first, last + 1)]
[docs]def containing_bins(start, stop=None): """ Given an interval `start:stop`, return bins for intervals completely *containing* `start:stop`. The order is according to the bin level (starting with the smallest bins), and within a level according to the bin number (ascending). :arg int start, stop: Interval positions (zero-based, open-ended). If `stop` is not provided, the interval is assumed to be of length 1 (equivalent to `stop = start + 1`). :return: All bins for intervals containing `start:stop`, ordered first according to bin level (ascending) and then according to bin number (ascending). :rtype: list(int) :raise OutOfRangeError: If `start:stop` exceeds the range of the binning scheme. """ if stop is None: stop = start + 1 max_bin = assign_bin(start, stop) return [bin for bin in overlapping_bins(start, stop) if bin <= max_bin]
[docs]def contained_bins(start, stop=None): """ Given an interval `start:stop`, return bins for intervals completely *contained by* `start:stop`. The order is according to the bin level (starting with the smallest bins), and within a level according to the bin number (ascending). :arg int start, stop: Interval positions (zero-based, open-ended). If `stop` is not provided, the interval is assumed to be of length 1 (equivalent to `stop = start + 1`). :return: All bins for intervals contained by `start:stop`, ordered first according to bin level (ascending) and then according to bin number (ascending). :rtype: list(int) :raise OutOfRangeError: If `start:stop` exceeds the range of the binning scheme. """ if stop is None: stop = start + 1 min_bin = assign_bin(start, stop) return [bin for bin in overlapping_bins(start, stop) if bin >= min_bin]
[docs]def covered_interval(bin): """ Given a bin number `bin`, return the interval covered by this bin. :arg int bin: Bin number. :return: Tuple of `start, stop` being the zero-based, open-ended interval covered by `bin`. :rtype: tuple(int) :raise OutOfRangeError: If bin number `bin` exceeds the maximum bin number. """ if bin < 0 or bin > MAX_BIN: raise OutOfRangeError( 'Invalid bin number %d (maximum bin number is %d)' % (bin, MAX_BIN)) shift = SHIFT_FIRST for offset in BIN_OFFSETS: if offset <= bin: return bin - offset << shift, bin + 1 - offset << shift shift += SHIFT_NEXT