remove_filter('comment-to-doc');

        // so it turns out this template isn't quite as readable as I hoped, but
        // it's a triple string, e.g:
        //  "{3} (?: [^"\\]+ | ""[^"\\]+ | "[^"\\]+ | \\.)* (?: "{3}|$)
        $triple_str_template = '%1$s{3} (?> [^%1$s\\\\]+ | %1$s%1$s[^%1$s\\\\]+ | %1$s[^%1$s\\\\]+ | \\\\. )* (?: %1$s{3}|$)';
        $str_template = '%1$s (?> [^%1$s\\\\]+ | \\\\. )* (?: %1$s|$)';
        $triple_dstr = sprintf($triple_str_template, '"');
        $triple_sstr = sprintf($triple_str_template, "'");

        $this->add_pattern('IDENT', '/[a-zA-Z_](?>\w*)(?!["\'])/');
        // I *assume* that Django tags terminate these
        $this->add_pattern('COMMENT', sprintf('/\#.*%s/', $this->django? '(?=[%}]\})' : ''));
        // decorator
        $this->add_pattern('TYPE', '/@(\w+\.?)+/');

        // Python strings may be prefixed with r (raw) or u (unicode).
        // This affects how it handles backslashes, but I don't *think* it
        // affects escaping of quotes....
        $this->add_pattern('STRING', "/[RUru]?$triple_dstr/xs");
        $this->add_pattern('STRING', "/[RUru]?$triple_sstr/xs");
        $this->add_pattern('STRING', "/[RUru]?" . sprintf($str_template, '"') . '/sx');
        $this->add_pattern('STRING', "/[RUru]?" . sprintf($str_template, "'") . '/xs');

        // EPIC.
        // NB: this pattern uses the /x modifier with '#' comments, so it must
        // stay on multiple lines: each '#' comment runs to the end of its line.
        $this->add_pattern('NUMERIC', '/
            #hex
            (?:0[xX](?>[0-9A-Fa-f]+)[lL]*)
            |
            # binary
            (?:0[bB][0-1]+)
            |
            #octal
            (?:0[oO0][0-7]+)
            |
            # regular number
            (?:
                (?>[0-9]+)
                (?:
                    # long identifier
                    [lL]
                    |
                    # Or a fractional part, which may be imaginary
                    (?:
                        (?:\.?(?>[0-9]+)?
                            (?:(?:[eE][\+\-]?)?(?>[0-9]+))?
                        )[jJ]?
                    )
                )?
            )
            |
            (
                # or only after the point, float x = .1;
                \.(?>[0-9]+)(?:(?:[eE][\+\-]?)?(?>[0-9]+))?[jJ]?
            )
            /x');

        // %} and }} are django terminators
        if ($this->django) {
            $this->add_pattern('TERM', '/[%}]\}/');
        }

        // catch the colon separately so we can use $match === ':' in figuring out
        // where docstrs occur
        $this->add_pattern('OPERATOR', '/\+=|-=|\*=|\/=|>=|<=|!=|==|\*\*|[!%^*\-=+;<>\\\\(){}\[\],\\.:]/');

        if ($this->django) {
            // Django specific keywords
            // https://docs.djangoproject.com/en/1.3/ref/templates/builtins/
            $this->add_identifier_mapping('KEYWORD', array(
                'autoescape', 'endautoescape', 'cycle', 'filter', 'endfilter',
                'include', 'extends', 'firstof', 'empty', 'ifchanged',
                'endifchanged', 'ifequal', 'endifequal', 'ifnotequal',
                'endifnotequal', 'load', 'now', 'regroup', 'spaceless',
                'endspaceless', 'ssi', 'url', 'widthratio', 'endwith', 'endfor',
                'endif', 'endwhile'));
        }

        // ('print' used to be listed twice here; the duplicate was redundant.)
        $this->add_identifier_mapping('KEYWORD', array(
            'assert', 'as', 'break', 'class', 'continue', 'del', 'def', 'elif',
            'else', 'except', 'exec', 'finally', 'for', 'from', 'global', 'if',
            'import', 'lambda', 'print', 'pass', 'raise', 'return', 'try',
            'while', 'yield', 'with', 'and', 'not', 'in', 'is', 'or'));

        $this->add_identifier_mapping('FUNCTION', array(
            'all', 'abs', 'any', 'basestring', 'bin', 'callable', 'chr',
            'classmethod', 'cmp', 'compile', 'dir', 'divmod', 'enumerate',
            'eval', 'execfile', 'file', 'filter', 'format', 'frozenset',
            'getattr', 'globals', 'hasattr', 'hash', 'help', 'hex', 'id',
            'input', 'isinstance', 'issubclass', 'iter', 'len', 'locals', 'map',
            'max', 'min', 'memoryview', 'next', 'object', 'oct', 'open', 'ord',
            'pow', 'property', 'range', 'raw_input', 'reduce', 'reload', 'repr',
            'reversed', 'round', 'setattr', 'slice', 'sorted', 'staticmethod',
            'sum', 'super', 'type', 'unichr', 'vars', 'xrange', 'zip',
            '__import__', 'bytearray', 'complex', 'dict', 'float', 'int',
            'list', 'long', 'set', 'str', 'tuple', 'unicode', 'apply', 'buffer',
            'coerce', 'intern'
        ));

        // http://docs.python.org/library/exceptions.html
        // 'WindowsError' and 'VMSError' were previously listed with their
        // platform notes attached ('WindowsError(Windows)', 'VMSError(VMS)');
        // the IDENT pattern cannot produce parentheses, so they never matched.
        $this->add_identifier_mapping('TYPE', array(
            'BaseException', 'SystemExit', 'KeyboardInterrupt', 'GeneratorExit',
            'Exception', 'StopIteration', 'StandardError', 'BufferError',
            'ArithmeticError', 'FloatingPointError', 'OverflowError',
            'ZeroDivisionError', 'AssertionError', 'AttributeError',
            'EnvironmentError', 'IOError', 'OSError', 'WindowsError',
            'VMSError', 'EOFError', 'ImportError', 'LookupError', 'IndexError',
            'KeyError', 'MemoryError', 'NameError', 'UnboundLocalError',
            'ReferenceError', 'RuntimeError', 'NotImplementedError',
            'SyntaxError', 'IndentationError', 'TabError', 'SystemError',
            'TypeError', 'ValueError', 'UnicodeError', 'UnicodeDecodeError',
            'UnicodeEncodeError', 'UnicodeTranslateError', 'Warning',
            'DeprecationWarning', 'PendingDeprecationWarning', 'RuntimeWarning',
            'SyntaxWarning', 'UserWarning', 'FutureWarning', 'ImportWarning',
            'UnicodeWarning', 'BytesWarning'));

        $this->add_identifier_mapping('VALUE', array('False', 'None', 'self', 'True'));
    }

    /**
     * Mini-scanner to handle highlighting module names in import lines.
     *
     * Consumes tokens until the end of the current line (or until something
     * unrecognised is met) and records each one. Names following `import`
     * are recorded as USER_FUNCTION and registered in $this->user_defs as
     * 'TYPE'; names following only `from` are recorded as plain IDENT.
     */
    private function import_line()
    {
        $import = false;
        $from = false;
        while (!$this->eol()) {
            $c = $this->peek();
            $tok = null;
            $m = null;

            if ($c === '\\') {
                // backslash: take it together with the character after it
                $m = $this->get(2);
            } elseif ($this->scan('/[,\\.;\\*]+/')) {
                $tok = 'OPERATOR';
            } elseif ($this->scan("/[ \t]+/")) {
                // horizontal whitespace: recorded below with a null token type
            } elseif (($m = $this->scan('/(?:import|from|as)\\b/'))) {
                // 'as' is scanned here with the other keywords so that, in
                // `import x as y`, it is not mistaken for an imported name and
                // registered as a user-defined type.
                if ($m === 'import') {
                    $import = true;
                } elseif ($m === 'from') {
                    $from = true;
                } elseif ($m !== 'as') {
                    assert(false);
                }
                // recorded as IDENT, like any other identifier that may be a keyword
                $tok = 'IDENT';
            } elseif ($this->scan('/[_a-zA-Z]\w*/')) {
                assert($from || $import);
                if ($import) {
                    // from module import *item*, or just import *item*
                    $tok = 'USER_FUNCTION';
                    $this->user_defs[$this->match()] = 'TYPE';
                } else {
                    // from *module* ...[import item], the module is not imported
                    $tok = 'IDENT';
                }
            } else {
                break;
            }

            // $m is only set by the branches that obtained their text directly;
            // otherwise the text is the scanner's last match.
            $this->record(($m !== null)? $m : $this->match(), $tok);
        }
    }

    /**
     * Main scanning loop for Python source.
     *
     * Repeatedly takes the next pattern match, records any unmatched text
     * before it with a null type, and then applies the doc-string / no-op
     * string heuristics and the class/def/import handling described inline.
     * Stops at end of string, or (Django mode) when a TERM token is met.
     */
    public function main()
    {
        $definition = false;
        $doccstr = false;
        $expect = false;

        while (!$this->eos()) {
            $tok = null;
            $index = $this->pos();

            if (($rule = $this->next_match()) !== null) {
                $tok = $rule[0];
                // text between the current position and the match is untyped
                if ($rule[1] > $index) {
                    $this->record(substr($this->string(), $index, $rule[1] - $index), null);
                }
            } else {
                // nothing else matches: the remainder is untyped text
                $this->record(substr($this->string(), $index), null);
                $this->terminate();
                break;
            }

            // Django terminator tag - break to superscanner
            if ($tok === 'TERM') {
                $this->unscan();
                break;
            }

            $m = $this->match();

            /* python doc strs are a pain because they're actually just strings.
             * Also, I'm pretty sure a string in a non-interesting place just counts
             * as a no-op and is also used as a comment sometimes
             * So we've got something a bit complicated going on here: if we meet
             * a 'class' or a 'def' (function def) then we wait until the next ':'
             * and say "we expect a doc-str now". If the next token is not a string,
             * we discard that state.
             *
             * similarly, if we meet a string which isn't a doc-str, we look behind
             * and expect to see an operator or open bracket, else it's a comment.
             * NOTE: we class ':' as a legal string preceding char because it's used
             * as dictionary key:value separators. This will fail on the case:
             *
             *   while 1:
             *     "do something"
             *     break
             *
             * NOTE: we're skipping whitespace.
             * NOTE: we disable the no-op detection for Django because the string
             * might be inside an output tag.
             */
            if ($definition && $doccstr) {
                if ($tok === 'STRING') {
                    $tok = 'COMMENT';
                }
            } elseif ($tok === 'STRING' && !$this->django) {
                // look behind for the nearest token that is neither untyped
                // text nor a comment
                $i = count($this->tokens);
                $tok = 'COMMENT';
                while ($i--) {
                    $t = $this->tokens[$i][0];
                    if ($t === null || $t === 'COMMENT') {
                        continue;
                    } elseif ($t === 'OPERATOR' || $t === 'IDENT' || $t === 'NUMERIC') {
                        $tok = 'STRING';
                    }
                    break;
                }
                // finally, if we can look ahead to a binary operator, or so,
                // we concede it probably is a string
                if ($tok === 'COMMENT') {
                    if ($this->check('/\s*(?: [+:&.,] | (?:and|or|is|not)\\b)/x')) {
                        $tok = 'STRING';
                    }
                }
            }

            // reset this; if it didn't catch above then it's not valid now.
            if ($definition && $doccstr) {
                $definition = false;
                $doccstr = false;
            }

            if ($tok === 'IDENT') {
                if ($m === 'import' || $m === 'from') {
                    // hand the whole line over to the import mini-scanner
                    $this->unscan();
                    $this->import_line();
                    continue;
                }
                if ($m === 'class' || $m === 'def') {
                    // these are definition keywords, the next token should be an
                    // identifier, which is a user-defined type or function
                    $definition = true;
                    $expect = 'user_def';
                } elseif ($expect === 'user_def') {
                    // this is caught on the next iteration
                    $tok = 'USER_FUNCTION';
                    $expect = false;
                    $this->user_defs[$m] = 'FUNCTION';
                }
            } else {
                // if this hasn't caught, it's not valid
                $expect = false;
            }

            if ($definition && $m === ':') {
                $doccstr = true;
            }

            $this->record($m, $tok);
        }
    }

    /**
     * Estimates how likely it is that $src is Python.
     *
     * @param string $src  the source code to inspect
     * @param array  $info metadata; only $info['shebang'] is read here
     * @return float 1.0 if the shebang mentions python, 0.0 for any other
     *               shebang, otherwise a sum of small heuristic scores
     */
    public static function guess_language($src, $info)
    {
        if (strpos($info['shebang'], 'python') !== false) {
            return 1.0;
        }
        if ($info['shebang']) {
            return 0.0;
        }

        $p = 0.0;
        // let's look for some trademark pythonic constructs, although I
        // have a feeling that recent versions of ECMA also implement some
        // of this
        if (preg_match('/^\s*+ for \s++ \w++ \s++ in \s++ \w++ \s*+ :/xm', $src)) {
            $p += 0.05;
        }
        if (preg_match('/True|False|None/', $src)) {
            $p += 0.01;
        }
        if (preg_match('/"{3}|\'{3}/', $src)) {
            $p += 0.05;
        }
        // class something(object)
        // NOTE(review): this check reads as commented out in the original, which
        // agrees with the running total below (0.05+0.01+0.05+0.2+0.1 = 0.41) - confirm.
        // if (preg_match('/^\s*+ class \s++ \w++ \s*+ \( \s*+ object \s*+ \)/xm', $src)) $p += 0.1;

        // def __init__ (constructor)
        if (preg_match('/\\bdef \s++ __init__\\b/x', $src)) {
            $p += 0.2;
        }
        // method decorators
        if (preg_match("/^\s*+ @[\w\\.]++ .*+ [\n\r]++ \s*+ def\\b/mx", $src)) {
            $p += 0.1;
        }
        // pmax = 0.41

        // common imports: import os|sys|re
        if (preg_match('/^import\s++(os|sys|re)\\b/m', $src)) {
            $p += 0.05;
        }
        // from x import y
        if (preg_match('/^\s*+ from \s++ (?:\w++(?:\.\w++)*+) \s++ import \s/xm', $src)) {
            $p += 0.10;
        }
        return $p;
    }
}

/**
 * Scanner for Django templates.
 *
 * Text outside template tags is delegated to a LuminousHTMLScanner; the
 * contents of {{ }} and {% %} tags are delegated to a LuminousPythonScanner
 * running in Django mode. Template comments are recorded directly.
 */
class LuminousDjangoScanner extends LuminousScanner
{
    // warning: some copying and pasting with the rails scanner here

    // HTML scanner has to be persistent.
    private $html_scanner;

    /**
     * Creates and configures the persistent HTML sub-scanner over the same
     * source string, telling it which sequences open a server-side tag.
     */
    public function init()
    {
        $this->html_scanner = new LuminousHTMLScanner();
        $this->html_scanner->string($this->string());
        $this->html_scanner->embedded_server = true;
        $this->html_scanner->server_tags = '/\{[{%#]/';
        $this->html_scanner->init();
    }

    /**
     * Runs the HTML sub-scanner from the current position, records its
     * tagged output and moves this scanner to where the sub-scanner stopped.
     */
    public function scan_html()
    {
        $this->html_scanner->pos($this->pos());
        $this->html_scanner->main();
        $this->record($this->html_scanner->tagged(), null, true);
        $this->pos($this->html_scanner->pos());
    }

    /**
     * Runs a fresh Python sub-scanner (Django mode) from the current
     * position, records its tagged output and moves this scanner to where
     * the sub-scanner stopped.
     *
     * @param bool $short if true the output is recorded as 'INTERPOLATION',
     *                    otherwise with a null type
     */
    public function scan_python($short=false)
    {
        $python_scanner = new LuminousPythonScanner($this->string());
        $python_scanner->django = true;
        $python_scanner->init();
        $python_scanner->pos($this->pos());
        $python_scanner->main();
        $this->record($python_scanner->tagged(), $short? 'INTERPOLATION' : null, true);
        $this->pos($python_scanner->pos());
    }

    /**
     * Main scanning loop: dispatches between template tags, template
     * comments and plain HTML until the end of the string.
     */
    public function main()
    {
        while (!$this->eos()) {
            $p = $this->pos();

            // django's tags are {{ }} and {% %}
            // there's also a {# #} comment tag but we can probably handle that here
            // more easily
            // same for {% comment %} ... {% endcomment %}
            if ($this->scan('/\{([{%])/')) {
                $match = $this->match();
                $m1 = $this->match_group(1);

                if ($this->scan('/\s*comment\s*%\}/')) {
                    // {% comment %} ... {% endcomment %}
                    $match .= $this->match();
                    $end_pattern = '/\{%\s*endcomment\s*%\}/';
                    if ($this->scan_until($end_pattern) !== null) {
                        $match .= $this->match();
                        $match .= $this->scan($end_pattern);
                    } else {
                        // unterminated comment: it runs to the end of the input
                        $match .= $this->rest();
                        $this->terminate();
                    }
                    $this->record($match, 'COMMENT');
                } else {
                    // {{ ... }} or {% ... %}
                    $this->record($match, 'DELIMITER');
                    // '{{' is an output tag, so its contents are an interpolation
                    $this->scan_python($m1 === '{');
                    if ($this->scan('/[}%]\}/')) {
                        $this->record($this->match(), 'DELIMITER');
                    }
                }
            } elseif ($this->scan('/\{\# (?: [^\#]++ | \#(?! \} ) )*+ (?: \#\} | $)/x')) {
                // {# ... #}
                $this->record($this->match(), 'COMMENT');
            } else {
                $this->scan_html();
            }

            // every iteration is expected to consume something
            assert($p < $this->pos());
        }
    }

    /**
     * Estimates how likely it is that $src is a Django template.
     *
     * @param string $src  the source code to inspect
     * @param array  $info metadata, passed through to the HTML scanner's guess
     * @return float the HTML score plus 0.01 when the HTML score is at least
     *               0.2 and the source contains '{{' or '{%'; otherwise 0.0
     */
    public static function guess_language($src, $info)
    {
        if (($html = LuminousHTMLScanner::guess_language($src, $info)) >= 0.2) {
            if (strpos($src, '{{') !== false || strpos($src, '{%') !== false) {
                return $html + 0.01;
            }
        }
        return 0.0;
    }
}