Source code for dxr.lines

"""Machinery for interspersing lines of text with linked and colored regions

The typical entrypoints are es_lines() and html_line().

Within this file, "tag" means a tuple of (file-wide offset, is_start, payload).

"""
import cgi
from itertools import chain
try:
    from itertools import compress
except ImportError:
    from itertools import izip
    def compress(data, selectors):
        return (d for d, s in izip(data, selectors) if s)
import json
from warnings import warn

from jinja2 import Markup

from dxr.plugins import all_plugins
from dxr.utils import without_ending


class Line(object):
    """Representation of a line's beginning and ending as the contents of a tag

    Exists to motivate the balancing machinery to close all the tags at the end
    of every line (and reopen any afterward that span lines).

    """
    sort_order = 0  # Sort Lines outermost.

    def __repr__(self):
        return 'Line()'

LINE = Line()


class RefClassIdTagger(type):
    """Metaclass which automatically generates an ``id`` attr on the class as
    a serializable class identifier.

    Having a dedicated identifier allows Ref subclasses to move or change name
    without breaking index compatibility.

    Expects a ``_plugin`` attr to use as a prefix.

    """
    def __new__(metaclass, name, bases, dict):
        dict['id'] = without_ending('Ref', name)
        return type.__new__(metaclass, name, bases, dict)
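
# For example, a (hypothetical) plugin class declared as
# ``class FunctionRef(Ref)`` ends up with ``id == 'Function'``: the trailing
# 'Ref' is stripped from the class name.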


class Ref(object):
    """Abstract superclass for a cross-reference attached to a run of text

    Carries enough data to construct a context menu, highlight instances of
    the same symbol, and show something informative on hover.

    """
    sort_order = 1
    __slots__ = ['tree', 'menu_data', 'hover', 'qualname_hash']
    __metaclass__ = RefClassIdTagger

    def __init__(self, tree, menu_data, hover=None, qualname=None,
                 qualname_hash=None):
        """
        :arg menu_data: Arbitrary JSON-serializable data from which we can
            construct a context menu
        :arg hover: The contents of the <a> tag's title attribute. (The
            first one wins.)
        :arg qualname: A hashable unique identifier for the symbol surrounded
            by this ref, for highlighting
        :arg qualname_hash: The hashed version of ``qualname``, which you can
            pass instead of ``qualname`` if you have access to the
            already-hashed version

        """
        self.tree = tree
        self.menu_data = menu_data
        self.hover = hover
        self.qualname_hash = hash(qualname) if qualname else qualname_hash

    def es(self):
        """Return a serialization of myself to store in elasticsearch."""
        ret = {'plugin': self.plugin,
               'id': self.id,
               # Smash the data into a string, because it will have a
               # different schema from subclass to subclass, and ES will
               # freak out:
               'menu_data': json.dumps(self.menu_data)}
        if self.hover:
            ret['hover'] = self.hover
        if self.qualname_hash is not None:  # could be 0
            ret['qualname_hash'] = self.qualname_hash
        return ret

    @staticmethod
    def es_to_triple(es_data, tree):
        """Convert ES-dwelling ref representation to a (start, end,
        :class:`~dxr.lines.Ref` subclass) triple.

        Return a subclass of Ref, chosen according to the ES data. Into its
        attributes "menu_data", "hover" and "qualname_hash", copy the ES
        properties of the same names, JSON-decoding "menu_data" first.

        :arg es_data: An item from the array under the 'refs' key of an ES
            LINE document
        :arg tree: The :class:`~dxr.config.TreeConfig` representing the tree
            from which the ``es_data`` was pulled

        """
        def ref_class(plugin, id):
            """Return the subclass of Ref identified by a combination of
            plugin and class ID."""
            plugins = all_plugins()
            try:
                return plugins[plugin].refs[id]
            except KeyError:
                warn('Ref subclass from plugin %s with ID %s was referenced '
                     'in the index but not found in the current '
                     'implementation. Ignored.' % (plugin, id))

        payload = es_data['payload']
        cls = ref_class(payload['plugin'], payload['id'])
        return (es_data['start'],
                es_data['end'],
                cls(tree,
                    json.loads(payload['menu_data']),
                    hover=payload.get('hover'),
                    qualname_hash=payload.get('qualname_hash')))

    def menu_items(self):
        """Return an iterable of menu items to be attached to a ref.

        Return an iterable of dicts of this form::

            {
                html: the HTML to be used as the menu item itself
                href: the URL to visit when the menu item is chosen
                title: the tooltip text given on hovering over the menu item
                icon: the icon to show next to the menu item: the name of a
                    PNG from the ``icons`` folder, without the .png extension
            }

        Typically, this pulls data out of ``self.menu_data``.

        """
        raise NotImplementedError

    def opener(self):
        """Emit the opening anchor tag for a cross reference.

        Menu item text, links, and metadata are JSON-encoded and dumped into
        a data attr on the tag. JS finds them there and creates a menu on
        click.

        """
        if self.hover:
            title = ' title="' + cgi.escape(self.hover, True) + '"'
        else:
            title = ''
        if self.qualname_hash is not None:
            cls = ' data-id="tok%i"' % self.qualname_hash
        else:
            cls = ''
        menu_items = list(self.menu_items())
        return u'<a data-menu="%s"%s%s>' % (
            cgi.escape(json.dumps(menu_items), True), title, cls)

    def closer(self):
        return u'</a>'
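

# A minimal sketch of a concrete cross-reference, included only to illustrate
# the menu_items() contract above. The class name, the assumed shape of
# ``menu_data`` (a dict carrying a 'definition_url' key), and the icon name
# are all invented; real subclasses live in the DXR plugins.
class ExampleRef(Ref):
    def menu_items(self):
        yield {'html': 'Jump to definition',
               'title': 'Go to the definition of this symbol',
               'href': self.menu_data.get('definition_url', '#'),
               'icon': 'jump'}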


class Region(object):
    """A <span> tag with a CSS class, wrapped around a run of text"""

    # Sort Regions innermost, as it doesn't matter if we split them.
    sort_order = 2
    __slots__ = ['css_class']

    def __init__(self, css_class):
        self.css_class = css_class

    def es(self):
        return self.css_class

    @classmethod
    def es_to_triple(cls, es_region):
        """Convert ES-dwelling region representation to a (start, end,
        :class:`~dxr.lines.Region`) triple."""
        return es_region['start'], es_region['end'], cls(es_region['payload'])

    def opener(self):
        return u'<span class="%s">' % cgi.escape(self.css_class, True)

    def closer(self):
        return u'</span>'

    def __repr__(self):
        """Return a nice representation for debugging."""
        return 'Region("%s")' % self.css_class
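

# For example, Region('k'), the sort of CSS class a syntax-highlighting
# plugin might hand back for a keyword, renders as <span class="k"> ...
# </span> around its run of text.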


def balanced_tags(tags):
    """Come up with a balanced series of tags which express the semantics of
    the given sorted interleaved ones.

    Return an iterable of (point, is_start, Region/Reg/Line) without any
    (pointless) zero-width tag spans. The output isn't necessarily optimal,
    but it's fast and not embarrassingly wasteful of space.

    """
    return without_empty_tags(balanced_tags_with_empties(tags))


def without_empty_tags(tags):
    """Filter zero-width tagged spans out of a sorted, balanced tag stream.

    Maintain tag order. Line break tags are considered self-closing.

    """
    buffer = []  # tags
    depth = 0

    for tag in tags:
        point, is_start, payload = tag

        if is_start:
            buffer.append(tag)
            depth += 1
        else:
            top_point, _, top_payload = buffer[-1]
            if top_payload is payload and top_point == point:
                # It's a closer, it matches the last thing in buffer, and it
                # and that open tag form a zero-width span. Cancel the last
                # thing in buffer.
                buffer.pop()
            else:
                # It's an end tag that actually encloses some stuff.
                buffer.append(tag)
            depth -= 1

        # If we have a balanced set of non-zero-width tags, emit them:
        if not depth:
            for b in buffer:
                yield b
            del buffer[:]


def balanced_tags_with_empties(tags):
    """Come up with a balanced series of tags which express the semantics of
    the given sorted interleaved ones.

    Return an iterable of (point, is_start, Region/Reg/Line), possibly
    including some zero-width tag spans. Each line is enclosed within Line
    tags.

    :arg tags: An iterable of (offset, is_start, payload) tuples, with one
        closer for each opener but possibly interleaved. There is one tag for
        each line break, with a payload of LINE and an is_start of False.
        Tags are ordered with closers first, then line breaks, then openers.

    """
    def close(to=None):
        """Return an iterable of closers for open tags up to (but not
        including) the one with the payload ``to``."""
        # Loop until empty (if we're not going "to" anything in particular)
        # or until the corresponding opener is at the top of the stack. We
        # check that "to is None" just to surface any stack-tracking bugs
        # that would otherwise cause opens to empty too soon.
        while opens if to is None else opens[-1] is not to:
            intermediate_payload = opens.pop()
            yield point, False, intermediate_payload
            closes.append(intermediate_payload)

    def reopen():
        """Yield open tags for all temporarily closed ones."""
        while closes:
            intermediate_payload = closes.pop()
            yield point, True, intermediate_payload
            opens.append(intermediate_payload)

    opens = []   # payloads of tags which are currently open
    closes = []  # payloads of tags which we've had to temporarily close so
                 # we could close an overlapping tag
    point = 0

    yield 0, True, LINE
    for point, is_start, payload in tags:
        if is_start:
            yield point, is_start, payload
            opens.append(payload)
        elif payload is LINE:
            # Close all open tags before a line break (since each line is
            # wrapped in its own <code> tag pair), and reopen them afterward.
            for t in close():  # I really miss "yield from".
                yield t

            # Since preserving self-closing linebreaks would throw off
            # without_empty_tags(), we convert to explicit closers here. We
            # surround each line with them because empty balanced ones would
            # get filtered out.
            yield point, False, LINE
            yield point, True, LINE

            for t in reopen():
                yield t
        else:
            # Temporarily close whatever's been opened between the start tag
            # of the thing we're trying to close and here:
            for t in close(to=payload):
                yield t

            # Close the current tag:
            yield point, False, payload
            opens.pop()

            # Reopen the temporarily closed ones:
            for t in reopen():
                yield t

    yield point, False, LINE


def tag_boundaries(tags):
    """Return a sequence of (offset, is_start, Region/Ref/Line) tuples.

    Basically, split the atomic tags that come out of plugins into separate
    start and end points, which can then be thrown together in a bag and
    sorted as the first step in the tag-balancing process.

    Like in Python slice notation, the offset of a tag refers to the index of
    the source code char it comes before.

    :arg tags: An iterable of (start, end, Ref) and (start, end, Region)
        tuples

    """
    for start, end, data in tags:
        # Filter out zero-length spans which don't do any good and which can
        # cause starts to sort after ends, crashing the tag balancer.
        # Incidentally filter out spans where start tags come after end tags,
        # though that should never happen.
        #
        # Also filter out None starts and ends. I don't know where they come
        # from. That shouldn't happen and should be fixed in the plugins.
        if (start is not None and start != -1 and
                end is not None and end != -1 and
                start < end):
            yield start, True, data
            yield end, False, data


def line_boundaries(lines):
    """Return a tag for the end of each line in a string.

    :arg lines: iterable of the contents of lines in a file, including any
        trailing newline character

    Endpoints and start points are coincident: right after a (universal)
    newline.

    """
    up_to = 0
    for line in lines:
        up_to += len(line)
        yield up_to, False, LINE


def non_overlapping_refs(tags):
    """Yield a False for each Ref in ``tags`` that overlaps a subsequent one,
    a True for the rest.

    Assumes the incoming tags, while not necessarily well balanced, have the
    start tag come before the end tag, if both are present. (Lines are
    weird.)

    """
    blacklist = set()
    open_ref = None
    for point, is_start, payload in tags:
        if isinstance(payload, Ref):
            if payload in blacklist:
                # It's the evil close tag of a misnested tag.
                blacklist.remove(payload)
                yield False
            elif open_ref is None:  # and is_start: (should always be true if
                                    # input is sane)
                open_ref = payload
                yield True
            elif open_ref is payload:  # it's the closer
                open_ref = None
                yield True
            else:
                # It's an evil open tag of a misnested tag.
                warn('htmlifier plugins requested overlapping <a> tags. Fix '
                     'the plugins.')
                blacklist.add(payload)
                yield False
        else:
            yield True


def remove_overlapping_refs(tags):
    """For any series of <a> tags that overlap each other, filter out all but
    the first.

    There's no decent way to represent that sort of thing in the UI, so we
    don't support it.

    :arg tags: A list of (point, is_start, payload) tuples, sorted by point.
        The tags do not need to be properly balanced.

    """
    # Reuse the list so we don't use any more memory.
    i = None
    for i, tag in enumerate(compress(tags, non_overlapping_refs(tags))):
        tags[i] = tag
    if i is not None:
        del tags[i + 1:]


def nesting_order((point, is_start, payload)):
    """Return a sorting key that places coincident Line boundaries outermost,
    then Ref boundaries, and finally Region boundaries.

    The Line bit saves some empty-tag elimination. The Ref bit saves splitting
    an <a> tag (and the attendant weird UI) for the following case::

        Ref    ____________  # The Ref should go on the outside.
        Region _____

    Other scenarios::

        Reg _______________            # Would be nice if Reg ended before
        Ref      ________________      # Ref started. We'll see about this
                                       # later.

        Reg _____________________      # Works either way
        Ref _______

        Reg _____________________
        Ref               _______      # This should be fine.

        Reg         _____________      # This should be fine as well.
        Ref ____________

        Reg _____
        Ref _____                      # This is fine either way.

    Also, endpoints sort before coincident start points to save work for the
    tag balancer.

    """
    return point, is_start, (payload.sort_order if is_start
                             else -payload.sort_order)


def finished_tags(lines, refs, regions):
    """Return an ordered iterable of properly nested tags which fully
    describe the refs and regions and their places in a file's text.

    :arg lines: iterable of lines of text of the file to htmlify.

    Benchmarking reveals that this function is O(number of tags) in practice,
    on inputs on the order of thousands of lines. On my laptop, it takes .02s
    for a 3000-line file with some pygmentize regions and some python refs.

    """
    # Plugins return unicode offsets, not byte ones.
    # balanced_tags undoes the sorting, but we tolerate that in html_lines().
    # Remark: this sort is the memory peak, but it is not a significant use
    # of time in an indexing run.
    tags = sorted(chain(tag_boundaries(chain(refs, regions)),
                        line_boundaries(lines)),
                  key=nesting_order)
    remove_overlapping_refs(tags)
    return balanced_tags(tags)


def tags_per_line(flat_tags):
    """Split tags on LINE tags, yielding the tags of one line at a time (no
    LINE tags are yielded).

    :arg flat_tags: An iterable of ordered, non-overlapping, non-empty tag
        boundaries with Line endpoints at (and outermost at) the index of the
        end of each line.

    """
    tags = []
    for tag in flat_tags:
        point, is_start, payload = tag
        if payload is LINE:
            if not is_start:
                yield tags
                tags = []
        else:
            tags.append(tag)


def es_lines(tags):
    """Yield lists of dicts, one per source code line, that can be indexed
    into the ``refs`` or ``regions`` field of the ``line`` doctype in
    elasticsearch, depending on the payload type.

    :arg tags: An iterable of ordered, non-overlapping, non-empty tag
        boundaries with Line endpoints at (and outermost at) the index of the
        end of each line.

    """
    for line in tags_per_line(tags):
        payloads = {}
        for pos, is_start, payload in line:
            if is_start:
                payloads[payload] = {'start': pos}
            else:
                payloads[payload]['end'] = pos
        # Indexed objects are refs or regions. Regions' payloads are just
        # strings; refs' payloads are objects. See mappings in
        # plugins/core.py.
        yield [{'payload': payload.es(),
                'start': pos['start'],
                'end': pos['end']}
               for payload, pos in payloads.iteritems()]
    # tags always ends with a LINE closer, so we don't need any additional
    # yield here to catch remnants.


def html_line(text, tags, bof_offset):
    """Return a line of Markup, interleaved with the refs and regions that
    decorate it.

    :arg tags: An ordered iterable of tags from the output of finished_tags,
        representing regions and refs
    :arg text: The unicode text to decorate
    :arg bof_offset: The position of the start of the line, measured from the
        beginning of the file

    """
    def segments(text, tags, bof_offset):
        up_to = 0
        for pos, is_start, payload in tags:
            # Convert from file-based position to line-based position.
            pos -= bof_offset
            yield cgi.escape(text[up_to:pos])
            up_to = pos
            if not is_start:  # It's a closer. Most common.
                yield payload.closer()
            else:
                yield payload.opener()
        yield cgi.escape(text[up_to:])

    return Markup(u''.join(segments(text, tags, bof_offset)))
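

if __name__ == '__main__':
    # A minimal end-to-end sketch, added for illustration and not part of DXR
    # itself. The input below is invented: two lines of text and one Region
    # marking the keyword "def". It shows the typical pipeline:
    # finished_tags() builds a balanced tag stream, which html_line() (or
    # es_lines()) then consumes.
    example_lines = ['def foo():\n', '    pass\n']
    example_regions = [(0, 3, Region('k'))]  # file-wide offsets 0-3: "def"
    example_refs = []

    # Render each line to markup, tracking the offset of each line's start.
    tags = finished_tags(example_lines, example_refs, example_regions)
    offset = 0
    for text, line_tags in zip(example_lines, tags_per_line(tags)):
        print html_line(text, line_tags, offset)
        offset += len(text)

    # The same tag stream, regenerated, can instead be serialized for
    # elasticsearch:
    for refs_and_regions in es_lines(
            finished_tags(example_lines, example_refs, example_regions)):
        print refs_and_regions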