"""Machinery for interspersing lines of text with linked and colored regions
The typical entrypoints are es_lines() and html_line().
Within this file, "tag" means a tuple of (file-wide offset, is_start, payload).
"""
import cgi
from itertools import chain
try:
from itertools import compress
except ImportError:
from itertools import izip
def compress(data, selectors):
return (d for d, s in izip(data, selectors) if s)
import json
from warnings import warn
from jinja2 import Markup
from dxr.plugins import all_plugins
from dxr.utils import without_ending
class Line(object):
"""Representation of a line's beginning and ending as the contents of a tag
Exists to motivate the balancing machinery to close all the tags at the end
of every line (and reopen any afterward that span lines).
"""
sort_order = 0 # Sort Lines outermost.
def __repr__(self):
return 'Line()'
LINE = Line()
class RefClassIdTagger(type):
"""Metaclass which automatically generates an ``id`` attr on the class as
a serializable class identifier.
Having a dedicated identifier allows Ref subclasses to move or change name
without breaking index compatibility.
Expects a ``_plugin`` attr to use as a prefix.
"""
def __new__(metaclass, name, bases, dict):
dict['id'] = without_ending('Ref', name)
return type.__new__(metaclass, name, bases, dict)
[docs]class Ref(object):
"""Abstract superclass for a cross-reference attached to a run of text
Carries enough data to construct a context menu, highlight instances of
the same symbol, and show something informative on hover.
"""
sort_order = 1
__slots__ = ['menu_data', 'hover', 'qualname_hash']
__metaclass__ = RefClassIdTagger
def __init__(self, tree, menu_data, hover=None, qualname=None, qualname_hash=None):
"""
:arg menu_data: Arbitrary JSON-serializable data from which we can
construct a context menu
:arg hover: The contents of the <a> tag's title attribute. (The first
one wins.)
:arg qualname: A hashable unique identifier for the symbol surrounded
by this ref, for highlighting
:arg qualname_hash: The hashed version of ``qualname``, which you can
pass instead of ``qualname`` if you have access to the
already-hashed version
"""
self.tree = tree
self.menu_data = menu_data
self.hover = hover
self.qualname_hash = hash(qualname) if qualname else qualname_hash
[docs] def es(self):
"""Return a serialization of myself to store in elasticsearch."""
ret = {'plugin': self.plugin,
'id': self.id,
# Smash the data into a string, because it will have a
# different schema from subclass to subclass, and ES will freak
# out:
'menu_data': json.dumps(self.menu_data)}
if self.hover:
ret['hover'] = self.hover
if self.qualname_hash is not None: # could be 0
ret['qualname_hash'] = self.qualname_hash
return ret
[docs] @staticmethod
def es_to_triple(es_data, tree):
"""Convert ES-dwelling ref representation to a (start, end,
:class:`~dxr.lines.Ref` subclass) triple.
Return a subclass of Ref, chosen according to the ES data. Into its
attributes "menu_data", "hover" and "qualname_hash", copy the ES
properties of the same names, JSON-decoding "menu_data" first.
:arg es_data: An item from the array under the 'refs' key of an ES LINE
document
:arg tree: The :class:`~dxr.config.TreeConfig` representing the tree
from which the ``es_data`` was pulled
"""
def ref_class(plugin, id):
"""Return the subclass of Ref identified by a combination of
plugin and class ID."""
plugins = all_plugins()
try:
return plugins[plugin].refs[id]
except KeyError:
warn('Ref subclass from plugin %s with ID %s was referenced '
'in the index but not found in the current '
'implementation. Ignored.' % (plugin, id))
payload = es_data['payload']
cls = ref_class(payload['plugin'], payload['id'])
return (es_data['start'],
es_data['end'],
cls(tree,
json.loads(payload['menu_data']),
hover=payload.get('hover'),
qualname_hash=payload.get('qualname_hash')))
[docs] def opener(self):
"""Emit the opening anchor tag for a cross reference.
Menu item text, links, and metadata are JSON-encoded and dumped into a
data attr on the tag. JS finds them there and creates a menu on click.
"""
if self.hover:
title = ' title="' + cgi.escape(self.hover, True) + '"'
else:
title = ''
if self.qualname_hash is not None:
cls = ' data-id="tok%i"' % self.qualname_hash
else:
cls = ''
menu_items = list(self.menu_items())
return u'<a data-menu="%s"%s%s>' % (
cgi.escape(json.dumps(menu_items), True),
title,
cls)
def closer(self):
return u'</a>'
[docs]class Region(object):
"""A <span> tag with a CSS class, wrapped around a run of text"""
# Sort Regions innermost, as it doesn't matter if we split them.
sort_order = 2
__slots__ = ['css_class']
def __init__(self, css_class):
self.css_class = css_class
def es(self):
return self.css_class
[docs] @classmethod
def es_to_triple(cls, es_region):
"""Convert ES-dwelling region representation to a (start, end,
:class:`~dxr.lines.Region`) triple."""
return es_region['start'], es_region['end'], cls(es_region['payload'])
def opener(self):
return u'<span class="%s">' % cgi.escape(self.css_class, True)
def closer(self):
return u'</span>'
def __repr__(self):
"""Return a nice representation for debugging."""
return 'Region("%s")' % self.css_class
def balanced_tags(tags):
"""Come up with a balanced series of tags which express the semantics of
the given sorted interleaved ones.
Return an iterable of (point, is_start, Region/Reg/Line) without any
(pointless) zero-width tag spans. The output isn't necessarily optimal, but
it's fast and not embarrassingly wasteful of space.
"""
return without_empty_tags(balanced_tags_with_empties(tags))
def without_empty_tags(tags):
"""Filter zero-width tagged spans out of a sorted, balanced tag stream.
Maintain tag order. Line break tags are considered self-closing.
"""
buffer = [] # tags
depth = 0
for tag in tags:
point, is_start, payload = tag
if is_start:
buffer.append(tag)
depth += 1
else:
top_point, _, top_payload = buffer[-1]
if top_payload is payload and top_point == point:
# It's a closer, and it matches the last thing in buffer and, it
# and that open tag form a zero-width span. Cancel the last thing
# in buffer.
buffer.pop()
else:
# It's an end tag that actually encloses some stuff.
buffer.append(tag)
depth -= 1
# If we have a balanced set of non-zero-width tags, emit them:
if not depth:
for b in buffer:
yield b
del buffer[:]
def balanced_tags_with_empties(tags):
"""Come up with a balanced series of tags which express the semantics of
the given sorted interleaved ones.
Return an iterable of (point, is_start, Region/Reg/Line), possibly
including some zero-width tag spans. Each line is enclosed within Line tags.
:arg tags: An iterable of (offset, is_start, payload) tuples, with one
closer for each opener but possibly interleaved. There is one tag for
each line break, with a payload of LINE and an is_start of False. Tags
are ordered with closers first, then line breaks, then openers.
"""
def close(to=None):
"""Return an iterable of closers for open tags up to (but not
including) the one with the payload ``to``."""
# Loop until empty (if we're not going "to" anything in particular) or
# until the corresponding opener is at the top of the stack. We check
# that "to is None" just to surface any stack-tracking bugs that would
# otherwise cause opens to empty too soon.
while opens if to is None else opens[-1] is not to:
intermediate_payload = opens.pop()
yield point, False, intermediate_payload
closes.append(intermediate_payload)
def reopen():
"""Yield open tags for all temporarily closed ones."""
while closes:
intermediate_payload = closes.pop()
yield point, True, intermediate_payload
opens.append(intermediate_payload)
opens = [] # payloads of tags which are currently open
closes = [] # payloads of tags which we've had to temporarily close so we could close an overlapping tag
point = 0
yield 0, True, LINE
for point, is_start, payload in tags:
if is_start:
yield point, is_start, payload
opens.append(payload)
elif payload is LINE:
# Close all open tags before a line break (since each line is
# wrapped in its own <code> tag pair), and reopen them afterward.
for t in close(): # I really miss "yield from".
yield t
# Since preserving self-closing linebreaks would throw off
# without_empty_tags(), we convert to explicit closers here. We
# surround each line with them because empty balanced ones would
# get filtered out.
yield point, False, LINE
yield point, True, LINE
for t in reopen():
yield t
else:
# Temporarily close whatever's been opened between the start tag of
# the thing we're trying to close and here:
for t in close(to=payload):
yield t
# Close the current tag:
yield point, False, payload
opens.pop()
# Reopen the temporarily closed ones:
for t in reopen():
yield t
yield point, False, LINE
def tag_boundaries(tags):
"""Return a sequence of (offset, is_start, Region/Ref/Line) tuples.
Basically, split the atomic tags that come out of plugins into separate
start and end points, which can then be thrown together in a bag and sorted
as the first step in the tag-balancing process.
Like in Python slice notation, the offset of a tag refers to the index of
the source code char it comes before.
:arg tags: An iterable of (start, end, Ref) and (start, end, Region) tuples
"""
for start, end, data in tags:
# Filter out zero-length spans which don't do any good and
# which can cause starts to sort after ends, crashing the tag
# balancer. Incidentally filter out spans where start tags come
# after end tags, though that should never happen.
#
# Also filter out None starts and ends. I don't know where they
# come from. That shouldn't happen and should be fixed in the
# plugins.
if (start is not None and start != -1 and
end is not None and end != -1 and
start < end):
yield start, True, data
yield end, False, data
def line_boundaries(lines):
"""Return a tag for the end of each line in a string.
:arg lines: iterable of the contents of lines in a file, including any
trailing newline character
Endpoints and start points are coincident: right after a (universal)
newline.
"""
up_to = 0
for line in lines:
up_to += len(line)
yield up_to, False, LINE
def non_overlapping_refs(tags):
"""Yield a False for each Ref in ``tags`` that overlaps a subsequent one,
a True for the rest.
Assumes the incoming tags, while not necessarily well balanced, have the
start tag come before the end tag, if both are present. (Lines are weird.)
"""
blacklist = set()
open_ref = None
for point, is_start, payload in tags:
if isinstance(payload, Ref):
if payload in blacklist: # It's the evil close tag of a misnested tag.
blacklist.remove(payload)
yield False
elif open_ref is None: # and is_start: (should always be true if input is sane)
assert is_start
open_ref = payload
yield True
elif open_ref is payload: # it's the closer
open_ref = None
yield True
else: # It's an evil open tag of a misnested tag.
warn('htmlifier plugins requested overlapping <a> tags. Fix the plugins.')
blacklist.add(payload)
yield False
else:
yield True
def remove_overlapping_refs(tags):
"""For any series of <a> tags that overlap each other, filter out all but
the first.
There's no decent way to represent that sort of thing in the UI, so we
don't support it.
:arg tags: A list of (point, is_start, payload) tuples, sorted by point.
The tags do not need to be properly balanced.
"""
# Reuse the list so we don't use any more memory.
i = None
for i, tag in enumerate(compress(tags, non_overlapping_refs(tags))):
tags[i] = tag
if i is not None:
del tags[i + 1:]
def nesting_order((point, is_start, payload)):
"""Return a sorting key that places coincident Line boundaries outermost,
then Ref boundaries, and finally Region boundaries.
The Line bit saves some empty-tag elimination. The Ref bit saves splitting
an <a> tag (and the attendant weird UI) for the following case::
Ref ____________ # The Ref should go on the outside.
Region _____
Other scenarios::
Reg _______________ # Would be nice if Reg ended before Ref
Ref ________________ # started. We'll see about this later.
Reg _____________________ # Works either way
Ref _______
Reg _____________________
Ref _______ # This should be fine.
Reg _____________ # This should be fine as well.
Ref ____________
Reg _____
Ref _____ # This is fine either way.
Also, endpoints sort before coincident start points to save work for the
tag balancer.
"""
return point, is_start, (payload.sort_order if is_start else
-payload.sort_order)
def finished_tags(lines, refs, regions):
"""Return an ordered iterable of properly nested tags which fully describe
the refs and regions and their places in a file's text.
:arg lines: iterable of lines of text of the file to htmlify.
Benchmarking reveals that this function is O(number of tags) in practice,
on inputs on the order of thousands of lines. On my laptop, it takes .02s
for a 3000-line file with some pygmentize regions and some python refs.
"""
# Plugins return unicode offsets, not byte ones.
# balanced_tags undoes the sorting, but we tolerate that in html_lines().
# Remark: this sort is the memory peak, but it is not a significant use of
# time in an indexing run.
tags = sorted(chain(tag_boundaries(chain(refs, regions)),
line_boundaries(lines)),
key=nesting_order)
remove_overlapping_refs(tags)
return balanced_tags(tags)
def tags_per_line(flat_tags):
"""Split tags on LINE tags, yielding the tags of one line at a time
(no LINE tags are yielded)
:arg flat_tags: An iterable of ordered, non-overlapping, non-empty tag
boundaries with Line endpoints at (and outermost at) the index of the
end of each line.
"""
tags = []
for tag in flat_tags:
point, is_start, payload = tag
if payload is LINE:
if not is_start:
yield tags
tags = []
else:
tags.append(tag)
def es_lines(tags):
"""Yield lists of dicts, one per source code line, that can be indexed
into the ``refs`` or ``regions`` field of the ``line`` doctype in
elasticsearch, depending on the payload type.
:arg tags: An iterable of ordered, non-overlapping, non-empty tag
boundaries with Line endpoints at (and outermost at) the index of the
end of each line.
"""
for line in tags_per_line(tags):
payloads = {}
for pos, is_start, payload in line:
if is_start:
payloads[payload] = {'start': pos}
else:
payloads[payload]['end'] = pos
# Index objects are refs or regions. Regions' payloads are just
# strings; refs' payloads are objects. See mappings in plugins/core.py
yield [{'payload': payload.es(),
'start': pos['start'],
'end': pos['end']}
for payload, pos in payloads.iteritems()]
# tags always ends with a LINE closer, so we don't need any additional
# yield here to catch remnants.
def html_line(text, tags, bof_offset):
"""Return a line of Markup, interleaved with the refs and regions that
decorate it.
:arg tags: An ordered iterable of tags from output of finished_tags
representing regions and refs
:arg text: The unicode text to decorate
:arg bof_offset: The byte position of the start of the line from the
beginning of the file.
"""
def segments(text, tags, bof_offset):
up_to = 0
for pos, is_start, payload in tags:
# Convert from file-based position to line-based position.
pos -= bof_offset
yield cgi.escape(text[up_to:pos])
up_to = pos
if not is_start: # It's a closer. Most common.
yield payload.closer()
else:
yield payload.opener()
yield cgi.escape(text[up_to:])
return Markup(u''.join(segments(text, tags, bof_offset)))