Source code for dxr.indexers

"""Base classes and convenience functions for writing indexers and skimmers"""

from collections import namedtuple
from operator import itemgetter
from os.path import join, islink
from warnings import warn

from funcy import group_by, decorator, imapcat

from dxr.utils import build_offset_map, split_content_lines


STRING_PROPERTY = {
    'type': 'string',
    'index': 'not_analyzed',
    'fields': {
        'lower': {  # for qualified_type direct searcher
            'type': 'string',
            'analyzer': 'lowercase'
        }
    }
}


# An unanalyzed object property holding a name and a qualname, each of which
# can be exact- or prefix-matched against. The line-level variant below also
# carries start/end bounds for highlighting.
QUALIFIED_FILE_NEEDLE = {
    'type': 'object',
    'properties': {
        'name': STRING_PROPERTY,
        'qualname': STRING_PROPERTY,
    }
}

QUALIFIED_LINE_NEEDLE = {
    'type': 'object',
    'properties': {
        'name': STRING_PROPERTY,
        # The clang plugin stores both type-distinguished and merely scoped
        # names here: both "Thing::foo(int num)" and "Thing::foo". Thus, the
        # value may be either a string or a list:
        'qualname': STRING_PROPERTY,
        'start': {
            'type': 'integer',
            'index': 'no'  # just for highlighting
        },
        'end': {
            'type': 'integer',
            'index': 'no'
        }
    }
}
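

# For illustration, a line needle stored under QUALIFIED_LINE_NEEDLE might
# look like this (identifiers invented; note qualname holding a list, per
# the comment above):
#
#     {'name': 'foo',
#      'qualname': ['Thing::foo(int num)', 'Thing::foo'],
#      'start': 4,
#      'end': 7}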


class PluginConfig(object):
    """Mixin providing access to the plugin-specific configuration of a tree

    Expects ``plugin_name`` and ``tree`` instance attrs.

    """
    @property
    def plugin_config(self):
        """Return a mapping of plugin-specific config options."""
        return getattr(self.tree, self.plugin_name)



class FolderToIndex(PluginConfig):
    """The FolderToIndex generates needles for folders and provides, as
    ``browse_headers``, an optional list of headers to display in the browse
    view.

    """
    browse_headers = []

    def __init__(self, plugin_name, tree, path):
        self.plugin_name = plugin_name
        self.tree = tree
        self.path = path

    def needles(self):
        return []
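

# A minimal sketch of a subclass (the header name is invented): it adds a
# "Size" column to the browse view and emits no folder needles.
#
#     class SizeFolderToIndex(FolderToIndex):
#         browse_headers = ['Size']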


class TreeToIndex(PluginConfig):
    """A TreeToIndex performs build environment setup and teardown and
    serves as a repository for scratch data that should persist across an
    entire indexing run.

    Instances must be pickleable so as to make the journey to worker
    processes. You might also want to keep the size down. It takes on the
    order of 2s for a 150MB pickle to make its way across process
    boundaries, including pickling and unpickling time. For this reason, we
    send the TreeToIndex once and then have it index several files before
    sending it again.

    """
    def __init__(self, plugin_name, tree, vcs_cache):
        """
        :arg tree: The configuration of the tree to index: a TreeConfig
        :arg vcs_cache: A :class:`~dxr.vcs.VcsCache` that describes any
            VCSes used by this tree. May be None if the tree does not
            contain any VCS repositories.

        """
        # We need source_folder, object_folder, temp_folder, and maybe
        # ignore_filenames out of the tree.
        self.plugin_name = plugin_name
        self.tree = tree
        self.vcs_cache = vcs_cache

    def environment(self, vars):
        """Return environment variables to add to the build environment.

        This is where the environment is commonly twiddled to activate and
        parametrize compiler plugins which dump analysis data.

        :arg vars: A dict of the already-set variables. You can make
            decisions based on these.

        You may return a new dict or scribble on ``vars`` and return it. In
        either case, the returned dict is merged into those from other
        plugins, with later plugins taking precedence in case of conflicting
        keys.

        """
        return vars
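
    # For example, a hypothetical plugin that routes compilation through an
    # analyzing compiler shim might override it like this (the variable
    # value is invented):
    #
    #     def environment(self, vars):
    #         vars['CC'] = '/path/to/analyzing-cc'
    #         return vars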

    def pre_build(self):
        """Hook called before the tree's build command is run

        This is a good place to make a temp folder to dump said data in. You
        can stash away a reference to it on me so later methods can find it.

        """

    def post_build(self):
        """Hook called after the tree's build command completes

        This is a good place to do any whole-program analysis, storing it on
        me or on disk.

        """

    def file_to_index(self, path, contents):
        """Return an object that provides data about a given file.

        Return an object conforming to the interface of
        :class:`FileToIndex`, generally a subclass of it.

        :arg path: A path to the file to index, relative to the tree's
            source folder
        :arg contents: What's in the file: unicode if we managed to guess an
            encoding and decode it, None otherwise

        Return None if there is no indexing to do on the file.

        Being a method on TreeToIndex, this can easily pass along the
        location of a temp directory or other shared setup artifacts.
        However, beware of passing mutable things; while the FileToIndex can
        mutate them, visibility of those changes will be limited to objects
        in the same worker process. Thus, a TreeToIndex-dwelling dict might
        be a suitable place for a cache but unsuitable for data that can't
        evaporate.

        If a plugin omits a TreeToIndex class,
        :meth:`~dxr.plugins.Plugin.from_namespace()` constructs one
        dynamically. The method implementations of that class are inherited
        from this class, with one exception: a ``file_to_index()`` method is
        dynamically constructed which returns a new instance of the
        ``FileToIndex`` class the plugin defines, if any.

        """
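
    # A typical override just constructs the plugin's FileToIndex and
    # forwards any shared setup artifacts. A sketch; ``MyFileToIndex`` and
    # the temp-folder attribute are invented:
    #
    #     def file_to_index(self, path, contents):
    #         return MyFileToIndex(path, contents, self.plugin_name,
    #                              self.tree, self._temp_folder)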

    # This is probably the place to add whatever_indexer()s for other kinds
    # of things, like modules, if we ever wanted to support some other view
    # of search results than files or lines, like a D3 diagram of an
    # inheritance hierarchy or call graph. We'd need to come up with some
    # way of looping around those modules to let various plugins contribute.
    # Perhaps we'd introduce another kind of plugin: an enumerator.


class FileToSkim(PluginConfig):
    """A source of rendering data about a file, generated at request time

    This is appropriate for unindexed files (such as old revisions pulled
    out of a VCS) or for data so large or cheap to produce that it's a bad
    tradeoff to store it in the index. An instance of me is mostly an
    opportunity for a shared cache among my methods.

    """
    def __init__(self, path, contents, plugin_name, tree,
                 file_properties=None, line_properties=None):
        """
        :arg path: The (bytestring) conceptual path to the file, relative to
            the tree's source folder. Such a file might not exist on disk.
            This is useful mostly as a hint for syntax coloring.
        :arg contents: What's in the file: unicode if we knew or
            successfully guessed an encoding, None otherwise. Don't return
            any by-line data for None; the framework won't have succeeded in
            breaking up the file by line for display, so there will be no
            useful UI for those data to support. In fact, most skimmers
            won't be able to do anything useful with None at all. For
            unicode, split the file into lines using universal newlines
            (``dxr.utils.split_content_lines()``); that's what the rest of
            the framework expects.
        :arg tree: The :class:`~dxr.config.TreeConfig` of the tree to which
            the file belongs

        If the file is indexed, there will also be...

        :arg file_properties: Dict of file-wide needles emitted by the
            indexer
        :arg line_properties: List of per-line needle dicts emitted by the
            indexer

        """
        self.path = path
        self.contents = contents
        self.plugin_name = plugin_name
        self.tree = tree
        self.file_properties = file_properties or {}
        # TODO: Not clear what the default here should be. repeat([])?
        self.line_properties = line_properties

    def is_interesting(self):
        """Return whether it's worthwhile to examine this file.

        For example, if this class knows about how to analyze JS files,
        return True only if ``self.path.endswith('.js')``.

        If something falsy is returned, the framework won't call
        data-producing methods like
        :meth:`~dxr.indexers.FileToSkim.links()`,
        :meth:`~dxr.indexers.FileToSkim.refs()`, etc.

        The default implementation selects only text files that are not
        symlinks. Note: even if a plugin decides that symlinks are
        interesting, it should remember that links, refs, regions, and
        by-line annotations will not be called, because views of symlinks
        redirect to the original file.

        """
        return self.contains_text() and not self.is_link()

    def refs(self):
        """Provide cross references for various spans of text, accessed
        through a context menu.

        Yield an ordered list of extents and menu items::

            (start, end, ref)

        ``start`` and ``end`` are the bounds of a slice of a Unicode string
        holding the contents of the file. (``refs()`` will not be called for
        binary files.)

        ``ref`` is a :class:`~dxr.lines.Ref`.

        """
        return []

    def regions(self):
        """Yield instructions for syntax coloring and other inline
        formatting of code.

        Yield an ordered list of extents and CSS classes (encapsulated in
        :class:`~dxr.lines.Region` instances)::

            (start, end, Region)

        ``start`` and ``end`` are the bounds of a slice of a Unicode string
        holding the contents of the file. (``regions()`` will not be called
        for binary files.)

        """
        return []
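
    # A sketch of an override that colors the first three chars of the file
    # as a keyword, assuming :class:`~dxr.lines.Region` wraps a CSS class
    # name:
    #
    #     def regions(self):
    #         yield 0, 3, Region('k')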

    def annotations_by_line(self):
        """Yield extra user-readable information about each line, hidden by
        default: compiler warnings that occurred there, for example.

        Yield a list of annotation maps for each line::

            {'title': ..., 'class': ..., 'style': ...}

        """
        # TODO: Why are these just per line? Shouldn't they return extents
        # like everybody else? We can still show them per line if we want.
        return []
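
    # For instance, a linting plugin might yield, for each line, a list like
    # the following, or [] for lines with nothing to report (the values are
    # invented):
    #
    #     [{'title': 'W291: trailing whitespace', 'class': 'lint-warning'}]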

    # Convenience methods:

    def contains_text(self):
        """Return whether this file can be decoded and divided into lines as
        text. Empty files contain text.

        This may come in handy as a component of your own
        :meth:`~dxr.indexers.FileToSkim.is_interesting()` methods.

        """
        return isinstance(self.contents, unicode)

    def char_offset(self, row, col):
        """Return the from-BOF unicode char offset for the char at the given
        row and column of the file we're indexing.

        This is handy for translating row- and column-oriented input to the
        format :meth:`~dxr.indexers.FileToSkim.refs()` and
        :meth:`~dxr.indexers.FileToSkim.regions()` want.

        :arg row: The 1-based line number, according to splitting in
            universal newline mode
        :arg col: The 0-based column number

        """
        return self._line_offsets()[row - 1] + col
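
    # For example, to turn a compiler's 1-based row and 0-based column
    # bounds into the offsets refs() and regions() expect (``make_ref`` is
    # invented):
    #
    #     start = self.char_offset(row, start_col)
    #     end = self.char_offset(row, end_col)
    #     yield start, end, make_ref()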

    def absolute_path(self):
        """Return the (bytestring) absolute path of the file to skim.

        Note: in skimmers, the returned path may not exist if the source
        folder moved between index and serve time.

        """
        return join(self.tree.source_folder, self.path)

    # Private methods:

    def _line_offsets(self):
        """Return (and cache) a list mapping 1-based line numbers to
        from-BOF Unicode offsets."""
        if not hasattr(self, '_line_offset_list'):
            if not self.contains_text():
                raise ValueError("Can't get line offsets for a file that "
                                 "isn't text.")
            lines = (split_content_lines(self.contents)
                     if self.contents is not None else [])
            self._line_offset_list = build_offset_map(lines)
        return self._line_offset_list


class FileToIndex(FileToSkim):
    """A source of search and rendering data about one source file"""

    def __init__(self, path, contents, plugin_name, tree):
        """Analyze a file or digest an analysis that happened at compile
        time.

        :arg path: The (bytestring) path to the file to index, relative to
            the tree's source folder
        :arg contents: What's in the file: unicode if we managed to guess at
            an encoding and decode it, None otherwise. Don't return any
            by-line data for None; the framework won't have succeeded in
            breaking up the file by line for display, so there will be no
            useful UI for those data to support. Think more along the lines
            of returning EXIF data to search by for a JPEG. For unicode,
            split the file into lines using universal newlines
            (``dxr.utils.split_content_lines()``); that's what the rest of
            the framework expects.
        :arg tree: The :class:`~dxr.config.TreeConfig` of the tree to which
            the file belongs

        Initialization-time analysis results may be socked away on an
        instance var. You can think of this constructor as a per-file
        post-build step. You could do this in a different method, using
        memoization, but doing it here makes for less code and less
        opportunity for error.

        FileToIndex classes of plugins may take whatever constructor args
        they like; it is the responsibility of their TreeToIndex objects'
        :meth:`~dxr.indexers.TreeToIndex.file_to_index()` methods to supply
        them. However, the ``path`` and ``contents`` instance vars should be
        initialized and have the above semantics, or a lot of the provided
        convenience methods and default implementations will break.

        """
        # We receive the file contents from the outside for two reasons:
        # (1) so we don't repeatedly redo the encoding guessing (which
        # involves iterating over potentially the whole file looking for
        # nulls) and (2) for symmetry with FileToSkim, so we can share many
        # method implementations.
        super(FileToIndex, self).__init__(path, contents, plugin_name, tree)

    def needles(self):
        """Return an iterable of key-value pairs of search data about the
        file as a whole: for example, modification date or file size.

        Each pair becomes an elasticsearch property and its value. If the
        framework encounters multiple needles of the same key (whether
        coming from the same plugin or different ones), all unique values
        will be retained using an elasticsearch array.

        """
        # We go with pairs rather than a map so we can just chain all these
        # together and pass them to a dict constructor: fewer temp vars.
        return []
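
    # A minimal sketch of an override (the needle key is invented; real
    # plugins typically namespace their keys):
    #
    #     def needles(self):
    #         yield 'num_chars', len(self.contents or u'')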

    def needles_by_line(self):
        """Return per-line search data for one file: for example, markers
        that indicate a function called "foo" is defined on a certain line.

        Yield an iterable of key-value pairs for each of a file's lines, one
        iterable per line, in order. The data might be data to search on or
        data stowed away for a later realtime thing to generate refs or
        regions from. In any case, each pair becomes an elasticsearch
        property and its value. If the framework encounters multiple needles
        of the same key on the same line (whether coming from the same
        plugin or different ones), all unique values will be retained using
        an elasticsearch array. Values may be dicts, in which case common
        keys get merged by :func:`~dxr.utils.append_update()`.

        This method is not called on symlink files, to maintain the illusion
        that they do not have contents, seeing as they cannot be viewed in
        file browsing.

        """
        return []
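
    # Plugins commonly produce sparse (key, mapping, Extent) triples and
    # then lean on the helpers defined at the bottom of this module
    # (``self._sparse_needles()`` is a hypothetical generator):
    #
    #     def needles_by_line(self):
    #         return iterable_per_line(with_start_and_end(
    #             split_into_lines(self._sparse_needles())))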


# Conveniences:


Extent = namedtuple('Extent', ['start', 'end'])  # 0-based
Position = namedtuple('Position', ['row', 'col'])  # col 0-based, row 1-based


class FuncSig(namedtuple('FuncSig', ['inputs', 'output'])):
    def __str__(self):
        return '{0} -> {1}'.format(
            tuple(self.inputs),
            self.output).replace("'", '').replace('"', '')


@decorator
def unsparsify(call):
    """Transform a sparse needle list [(key, val, span:Extent)] into the
    line-by-line format needles_by_line expects: [[(key, val)]].

    """
    return group_needles(by_line(call()))


# Deprecated in favor of iterable_per_line()
def group_needles(line_needles):
    """Group line needles by line, and return a list of needles for each
    line, up to the last line with any needles::

        [(a, 1), (b, 4), (c, 4)] -> [[a], [], [], [b, c]]

    """
    # Jam all the needles of a file into a hash by line number:
    line_map = group_by(itemgetter(1), line_needles)  # {line: needles}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1
    # Pull out the needles for each line, stripping off the line number
    # elements of the tuples and producing a blank list for missing lines.
    # (The defaultdict returned from group_by takes care of the latter.)
    return [[pair for (pair, _) in line_map[line_num]]
            for line_num in xrange(1, last_line)]


# Deprecated
def by_line(span_needles):
    """Transform [(_, span:Extent)] into [(_, line:int)].

    Converts spans to lines. The resulting iterable is at least as long as
    the input.

    """
    return ((key_object_pair(*kv_start_end), line_number) for
            kv_start_end, line_number in
            imapcat(span_to_lines, span_needles))


# Deprecated in favor of with_start_and_end()
def key_object_pair((k, v), start, end):
    """Transform a key/value pair, along with start and end columns, to a
    key/multi-propertied-object pair that can be stored in elasticsearch and
    then used to support searching and highlighting.

    """
    return k, {'value': v, 'start': start, 'end': end}


# Deprecated in favor of split_into_lines()
def span_to_lines((kv, span)):
    """Expand ((k, v), span:Extent) into [(((k, v), line_span), line:int)].

    line_span has shape: (col1, col2)

    """
    if span.end.row == span.start.row:
        yield (kv, span.start.col, span.end.col), span.start.row
    elif span.end.row < span.start.row:
        warn('Bad Extent: end.row < start.row: %s < %s' %
             (span.end.row, span.start.row))
    else:
        # TODO: There are a lot of Nones used as slice bounds below. Do we
        # ever translate them back into char offsets? If not, does the
        # highlighter or anything else choke on them?
        yield (kv, span.start.col, None), span.start.row
        # Really wish we could use yield from
        for row in xrange(span.start.row + 1, span.end.row):
            yield (kv, 0, None), row
        yield (kv, 0, span.end.col), span.end.row


def split_into_lines(triples):
    """Split a bunch of (key, mapping, extent) triples into more triples
    than those, with each one contained in a line.

    """
    def _split_one((key, mapping, extent)):
        """Split a single triple into one or more, each spanning at most one
        line.

        """
        if extent.end.row == extent.start.row:
            yield key, mapping, extent
        elif extent.end.row < extent.start.row:
            # This indicates a bug in an indexer plugin.
            warn('Bad extent: end.row < start.row: %s < %s' %
                 (extent.end.row, extent.start.row))
        else:
            # TODO: There are a lot of Nones used as slice bounds below. Do
            # we ever translate them back into char offsets? If not, does
            # the highlighter or anything else choke on them?
            yield key, mapping, Extent(Position(row=extent.start.row,
                                                col=extent.start.col),
                                       Position(row=extent.start.row,
                                                col=None))
            # Really wish we could use yield from
            for row in xrange(extent.start.row + 1, extent.end.row):
                yield key, mapping, Extent(Position(row=row, col=0),
                                           Position(row=row, col=None))
            yield key, mapping, Extent(Position(row=extent.end.row, col=0),
                                       Position(row=extent.end.row,
                                                col=extent.end.col))

    return imapcat(_split_one, triples)


def with_start_and_end(triples):
    """Add 'start' and 'end' column keys to the value mappings of one-line
    triples, and yield them back.

    """
    for key, mapping, extent in triples:
        mapping['start'] = extent.start.col
        mapping['end'] = extent.end.col
        yield key, mapping, extent


def iterable_per_line(triples):
    """Yield iterables of (key, value mapping), one for each line."""
    # Jam all the triples of a file into a hash by line number:
    line_map = group_by(lambda (k, v, extent): extent.start.row,
                        triples)  # {line: triples}
    last_line = max(line_map.iterkeys()) + 1 if line_map else 1
    # Pull out the needles for each line, stripping off the extents and
    # producing a blank list for missing lines. (The defaultdict returned
    # from group_by takes care of the latter.)
    return [[(k, v) for (k, v, e) in line_map[line_num]]
            for line_num in xrange(1, last_line)]

    # If this has to be generic so we can use it on annotations_by_line as
    # well, pass in a key function that extracts the line number and maybe
    # another that constructs the return value.


def iterable_per_line_sorted(triples):
    """Yield iterables of (key, value mapping), one for each line, where the
    triples are already sorted by line.

    """
    last_row = 1
    last_row_kvs = []
    for k, v, extent in triples:
        if extent.start.row == last_row:
            last_row_kvs.append((k, v))
        else:
            yield last_row_kvs
            # Yield empty lists for any skipped lines.
            for _ in xrange(last_row + 1, extent.start.row):
                yield []
            last_row_kvs = [(k, v)]
            last_row = extent.start.row
    # Emit anything on the last line.
    yield last_row_kvs
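

# A worked example of split_into_lines(): a needle spanning rows 1-2 becomes
# two one-line triples (col=None means "to the end of the line"):
#
#     triples = [('py_ref', {'value': 'frob'},
#                 Extent(Position(row=1, col=5), Position(row=2, col=3)))]
#     list(split_into_lines(triples))
#     # => [('py_ref', {'value': 'frob'},
#     #      Extent(Position(row=1, col=5), Position(row=1, col=None))),
#     #     ('py_ref', {'value': 'frob'},
#     #      Extent(Position(row=2, col=0), Position(row=2, col=3)))]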