mirror of
https://github.com/pierre42100/comunic
synced 2025-01-04 18:09:00 +00:00
413 lines
14 KiB
PHP
413 lines
14 KiB
PHP
|
<?php
|
||
|
/*
|
||
|
* Python scanner - includes Django
|
||
|
*
|
||
|
* TODO: Django does not respect {% comment %} ... {% endcomment %}
|
||
|
*/
|
||
|
/**
 * Scanner for Python source.
 *
 * The same class is reused for the code found inside Django template tags;
 * set $django to true before calling init() to enable that mode.
 */
class LuminousPythonScanner extends LuminousScanner {

  /**
   * Django mode switch. When true, init() additionally registers the
   * '%}' / '}}' terminator pattern and the Django keywords, and main()
   * stops at the first terminator it meets.
   */
  public $django = false;

  /**
   * Registers every token pattern and identifier mapping used by main().
   *
   * Must be called after $django has been given its final value because
   * several patterns depend on it.
   */
  public function init() {

    $this->remove_filter('comment-to-doc');

    // The triple-quoted template is not very readable; for a double quote
    // it expands to:
    //   "{3} (?> [^"\\]+ | ""[^"\\]+ | "[^"\\]+ | \\. )* (?: "{3}|$)
    // i.e. an opening triple quote, then any run of non-quote text, one or
    // two quotes followed by non-quote text, or an escape, up to a closing
    // triple quote or the end of input.
    $triple_str_template = '%1$s{3} (?> [^%1$s\\\\]+ | %1$s%1$s[^%1$s\\\\]+ | %1$s[^%1$s\\\\]+ | \\\\. )* (?: %1$s{3}|$)';
    $str_template = '%1$s (?> [^%1$s\\\\]+ | \\\\. )* (?: %1$s|$)';
    $triple_dstr = sprintf($triple_str_template, '"');
    $triple_sstr = sprintf($triple_str_template, "'");

    // The negative lookahead keeps a string prefix (r"...", u'...') from
    // being swallowed as an identifier.
    $this->add_pattern('IDENT', '/[a-zA-Z_](?>\w*)(?!["\'])/');

    // In Django mode a comment is assumed to be terminated by the closing
    // template tag, which must be left in place for the TERM pattern.
    $this->add_pattern('COMMENT', sprintf('/\#.*%s/',
      $this->django? '(?=[%}]\})' : ''));

    // decorator
    $this->add_pattern('TYPE', '/@(\w+\.?)+/');

    // Python strings may be prefixed with r (raw) or u (unicode).
    // This changes how backslashes are interpreted, but it is not believed
    // to change how quotes are escaped.
    $this->add_pattern('STRING', "/[RUru]?$triple_dstr/xs");
    $this->add_pattern('STRING', "/[RUru]?$triple_sstr/xs");
    $this->add_pattern('STRING', "/[RUru]?" . sprintf($str_template, '"') . '/sx');
    $this->add_pattern('STRING', "/[RUru]?" . sprintf($str_template, "'") . '/xs');

    // Numeric literals: hex, binary, octal, then decimal integers/floats
    // with optional long ('l') or imaginary ('j') suffixes.
    $this->add_pattern('NUMERIC', '/
      #hex
      (?:0[xX](?>[0-9A-Fa-f]+)[lL]*)
      |
      # binary
      (?:0[bB][0-1]+)
      |
      #octal
      (?:0[oO0][0-7]+)
      |
      # regular number
      (?:
        (?>[0-9]+)
        (?:
          # long identifier
          [lL]
          |
          # Or a fractional part, which may be imaginary
          (?:
            (?:\.?(?>[0-9]+)?
              (?:(?:[eE][\+\-]?)?(?>[0-9]+))?
            )[jJ]?
          )
        )?
      )
      |
      (
        # or only after the point, float x = .1;
        \.(?>[0-9]+)(?:(?:[eE][\+\-]?)?(?>[0-9]+))?[jJ]?
      )
      /x');

    // %} and }} are django terminators
    if ($this->django) {
      $this->add_pattern('TERM', '/[%}]\}/');
    }

    // The colon is matched on its own (not as part of a longer operator) so
    // that main() can test $match === ':' when working out where docstrings
    // may occur.
    $this->add_pattern('OPERATOR', '/\+=|-=|\*=|\/=|>=|<=|!=|==|\*\*|[!%^*\-=+;<>\\\\(){}\[\],\\.:]/');

    if ($this->django) {
      // Django specific keywords
      // https://docs.djangoproject.com/en/1.3/ref/templates/builtins/
      $this->add_identifier_mapping('KEYWORD', array(
        'autoescape', 'endautoescape', 'cycle', 'filter', 'endfilter',
        'include', 'extends', 'firstof', 'empty', 'ifchanged',
        'endifchanged', 'ifequal', 'endifequal', 'ifnotequal',
        'endifnotequal', 'load', 'now', 'regroup', 'spaceless',
        'endspaceless', 'ssi', 'url', 'widthratio', 'endwith',
        'endfor', 'endif',
        'endwhile'));
    }

    $this->add_identifier_mapping('KEYWORD', array(
      'assert', 'as', 'break', 'class', 'continue', 'del', 'def', 'elif',
      'else', 'except', 'exec', 'finally', 'for', 'from', 'global', 'if',
      'import', 'lambda', 'print', 'pass', 'raise', 'return', 'try',
      'while', 'yield', 'with',
      'and', 'not', 'in', 'is', 'or'));

    $this->add_identifier_mapping('FUNCTION', array(
      'all', 'abs', 'any', 'basestring', 'bin', 'callable', 'chr',
      'classmethod', 'cmp', 'compile', 'dir', 'divmod', 'enumerate', 'eval',
      'execfile', 'file', 'filter', 'format', 'frozenset', 'getattr',
      'globals', 'hasattr', 'hash', 'help', 'hex', 'id', 'input',
      'isinstance', 'issubclass', 'iter', 'len', 'locals', 'map', 'max',
      'min', 'memoryview', 'next', 'object', 'oct', 'open', 'ord', 'pow',
      'property', 'range', 'raw_input', 'reduce', 'reload', 'repr',
      'reversed', 'round', 'setattr', 'slice', 'sorted', 'staticmethod',
      'sum', 'super', 'type', 'unichr', 'vars', 'xrange', 'zip',
      '__import__',

      'bytearray', 'complex', 'dict', 'float', 'int', 'list', 'long', 'set',
      'str', 'tuple', 'unicode', 'apply', 'buffer', 'coerce', 'intern'
    ));

    // http://docs.python.org/library/exceptions.html
    // Every entry has to be a bare identifier, otherwise it can never equal
    // an IDENT token: hence 'WindowsError' / 'VMSError' without the
    // platform annotations that accompany them in the documentation.
    $this->add_identifier_mapping('TYPE', array(
      'BaseException', 'SystemExit', 'KeyboardInterrupt', 'GeneratorExit',
      'Exception', 'StopIteration', 'StandardError', 'BufferError',
      'ArithmeticError', 'FloatingPointError', 'OverflowError',
      'ZeroDivisionError', 'AssertionError', 'AttributeError',
      'EnvironmentError', 'IOError', 'OSError', 'WindowsError', 'VMSError',
      'EOFError', 'ImportError', 'LookupError', 'IndexError', 'KeyError',
      'MemoryError', 'NameError', 'UnboundLocalError', 'ReferenceError',
      'RuntimeError', 'NotImplementedError', 'SyntaxError',
      'IndentationError', 'TabError', 'SystemError', 'TypeError',
      'ValueError', 'UnicodeError', 'UnicodeDecodeError',
      'UnicodeEncodeError', 'UnicodeTranslateError', 'Warning',
      'DeprecationWarning', 'PendingDeprecationWarning', 'RuntimeWarning',
      'SyntaxWarning', 'UserWarning', 'FutureWarning', 'ImportWarning',
      'UnicodeWarning', 'BytesWarning'));

    $this->add_identifier_mapping('VALUE', array('False', 'None', 'self',
      'True'));
  }


  /**
   * Mini-scanner for a single import statement.
   *
   * Called by main() with the scan position on the 'import' / 'from'
   * word. Consumes tokens until the end of the line (a backslash plus the
   * following character is consumed as a pair, so a continued line keeps
   * going) or until something unrecognised is met. Names that follow
   * 'import' are recorded as USER_FUNCTION and stored in $this->user_defs;
   * the module name between 'from' and 'import' is left as a plain IDENT.
   */
  private function import_line() {
    $import = false;
    $from = false;
    while (!$this->eol()) {
      $c = $this->peek();
      $tok = null;
      $m = null;

      if ($c === '\\') {
        // backslash + next char: recorded untagged
        $m = $this->get(2);
      }
      elseif ($this->scan('/[,\\.;\\*]+/')) {
        $tok = 'OPERATOR';
      }
      elseif ($this->scan("/[ \t]+/")) {
        // whitespace: recorded untagged
      }
      elseif (($m = $this->scan('/(?:import|from|as)\\b/'))) {
        if ($m === 'import') $import = true;
        elseif ($m === 'from') $from = true;
        // 'as' merely introduces an alias: it is a keyword, not an imported
        // name, so it must not reach the user_defs branch below.
        else assert($m === 'as');
        $tok = 'IDENT';
      }
      elseif ($this->scan('/[_a-zA-Z]\w*/')) {
        assert($from || $import);
        if ($import) {
          // from module import *item*, or just import *item*
          $tok = 'USER_FUNCTION';
          $this->user_defs[$this->match()] = 'TYPE';
        }
        else {
          // from *module* ...[import item]: the module itself is not
          // imported into the namespace
          $tok = 'IDENT';
        }
      }
      else {
        break;
      }

      $this->record(($m !== null)? $m : $this->match(), $tok);
    }
  }


  /**
   * Main scanning loop: tokenises from the current position to the end of
   * the input, or up to (not including) a Django terminator in Django mode.
   */
  public function main() {
    // true from a 'class'/'def' keyword until its docstring slot has passed
    $definition = false;
    // true once the ':' following that 'class'/'def' has been seen
    $docstr = false;
    // true when the next identifier is the name being defined
    $expect_name = false;

    while (!$this->eos()) {
      $tok = null;
      $index = $this->pos();

      $rule = $this->next_match();
      if ($rule === null) {
        // nothing left to match: the remainder is recorded untagged
        $this->record(substr($this->string(), $index), null);
        $this->terminate();
        break;
      }
      $tok = $rule[0];
      if ($rule[1] > $index) {
        // text skipped between the old position and the match start
        $this->record(substr($this->string(), $index, $rule[1] - $index), null);
      }

      // Django terminator tag - hand control back to the superscanner
      if ($tok === 'TERM') {
        $this->unscan();
        break;
      }

      $m = $this->match();

      /* Python docstrings are awkward because they are ordinary strings,
       * and a string standing alone as a statement is a no-op that is
       * sometimes used as a comment.
       *
       * Strategy: after a 'class' or 'def' we wait for the next ':' and
       * then expect a docstring. If the token after the ':' is a string it
       * becomes a COMMENT; either way the state is then discarded.
       *
       * Any other string is classified by looking behind, past untagged
       * text and comments: if the previous token is an operator,
       * identifier or number it is a real string, otherwise it is treated
       * as a comment -- unless what follows looks like a binary operator.
       * ':' counts as a legal preceding operator because it separates
       * dictionary keys from values, which means this case is
       * misclassified:
       *
       *   while 1:
       *     "do something"
       *     break
       *
       * The no-op detection is disabled for Django because the string may
       * sit inside an output tag.
       */
      $in_docstr_slot = $definition && $docstr;

      if ($in_docstr_slot) {
        if ($tok === 'STRING') {
          $tok = 'COMMENT';
        }
      }
      elseif ($tok === 'STRING' && !$this->django) {
        $tok = 'COMMENT';
        for ($i = count($this->tokens) - 1; $i >= 0; $i--) {
          $prev = $this->tokens[$i][0];
          if ($prev === null || $prev === 'COMMENT') {
            continue;
          }
          if ($prev === 'OPERATOR' || $prev === 'IDENT' || $prev === 'NUMERIC') {
            $tok = 'STRING';
          }
          // only the nearest significant token is consulted
          break;
        }
        // finally, if we can look ahead to a binary operator or similar,
        // concede that it probably is a string
        if ($tok === 'COMMENT'
          && $this->check('/\s*(?: [+:&.,] | (?:and|or|is|not)\\b)/x')
        ) {
          $tok = 'STRING';
        }
      }

      // The docstring slot is only valid for one token.
      if ($in_docstr_slot) {
        $definition = false;
        $docstr = false;
      }

      if ($tok === 'IDENT') {
        if ($m === 'import' || $m === 'from') {
          // rewind so import_line() sees the keyword itself
          $this->unscan();
          $this->import_line();
          continue;
        }
        if ($m === 'class' || $m === 'def') {
          // definition keyword: the next identifier is a user-defined
          // type or function, picked up on the next iteration
          $definition = true;
          $expect_name = true;
        }
        elseif ($expect_name) {
          $tok = 'USER_FUNCTION';
          $expect_name = false;
          $this->user_defs[$m] = 'FUNCTION';
        }
      }
      else {
        // anything other than an identifier cancels the expectation
        $expect_name = false;
      }

      if ($definition && $m === ':') {
        $docstr = true;
      }

      $this->record($m, $tok);
    }
  }


  /**
   * Heuristic estimate of how likely $src is to be Python.
   *
   * @param $src   the source text
   * @param $info  array; only the 'shebang' entry is read here
   * @return float 1.0 for a shebang mentioning python, 0.0 for any other
   *   shebang, otherwise the sum of the weights of the constructs found.
   */
  public static function guess_language($src, $info) {
    if (strpos($info['shebang'], 'python') !== false) return 1.0;
    if ($info['shebang']) return 0.0;

    $p = 0.0;
    // Look for some trademark pythonic constructs, although recent
    // versions of ECMAScript may also implement some of these.

    // for x in y:
    if (preg_match('/^\s*+ for \s++ \w++ \s++ in \s++ \w++ \s*+ :/xm', $src))
      $p += 0.05;
    if (preg_match('/True|False|None/', $src))
      $p += 0.01;
    // triple-quoted strings
    if (preg_match('/"{3}|\'{3}/', $src))
      $p += 0.05;
    // class something(object)
    if (preg_match('/^\s*+ class \s++ \w++ \s*+ \( \s*+ object \s*+ \)/xm', $src))
      $p += 0.1;
    // def __init__ (constructor)
    if (preg_match('/\\bdef \s++ __init__\\b/x', $src))
      $p += 0.2;
    // method decorators
    if (preg_match("/^\s*+ @[\w\\.]++ .*+ [\n\r]++ \s*+ def\\b/mx", $src))
      $p += 0.1;
    // maximum reachable so far: 0.51

    // common imports: import os|sys|re
    if (preg_match('/^import\s++(os|sys|re)\\b/m', $src))
      $p += 0.05;
    // from x import y
    if (preg_match('/^\s*+ from \s++ (?:\w++(?:\.\w++)*+) \s++ import \s/xm', $src))
      $p += 0.10;

    return $p;
  }
}
|
||
|
|
||
|
|
||
|
/**
 * Scanner for Django templates.
 *
 * Template tags are handled here ({{ }} and {% %} bodies are delegated to
 * LuminousPythonScanner in Django mode); everything else is delegated to a
 * LuminousHTMLScanner.
 *
 * Warning: some of this is copied and pasted from the rails scanner.
 */
class LuminousDjangoScanner extends LuminousScanner {

  /**
   * The HTML sub-scanner. It has to be persistent, so one instance is
   * created in init() and reused for every HTML stretch.
   */
  private $html_scanner;

  /**
   * Builds and configures the HTML sub-scanner over the same source string,
   * telling it which sequences open a server-side (Django) tag.
   */
  public function init() {
    $html = new LuminousHTMLScanner();
    $this->html_scanner = $html;
    $html->string($this->string());
    $html->embedded_server = true;
    $html->server_tags = '/\{[{%#]/';
    $html->init();
  }

  /**
   * Runs the HTML sub-scanner from the current position, records its
   * tagged output and moves this scanner to where the sub-scanner stopped.
   */
  public function scan_html() {
    $html = $this->html_scanner;
    $html->pos($this->pos());
    $html->main();
    $this->record($html->tagged(), null, true);
    $this->pos($html->pos());
  }

  /**
   * Runs a fresh Python scanner (in Django mode) from the current position,
   * records its tagged output and moves this scanner to where it stopped.
   *
   * @param $short  when true the output is recorded as 'INTERPOLATION',
   *   otherwise with a null token name. main() passes true for {{ ... }}.
   */
  public function scan_python($short=false) {
    $python = new LuminousPythonScanner($this->string());
    $python->django = true;
    $python->init();
    $python->pos($this->pos());
    $python->main();

    $token_name = null;
    if ($short) {
      $token_name = 'INTERPOLATION';
    }
    $this->record($python->tagged(), $token_name, true);
    $this->pos($python->pos());
  }

  /**
   * Main loop. At each position, in order of preference:
   *  - '{{' or '{%': a {% comment %} ... {% endcomment %} block is recorded
   *    as one COMMENT; any other tag has its delimiters recorded as
   *    DELIMITER and its body passed to scan_python();
   *  - '{# ... #}' is recorded as a COMMENT;
   *  - otherwise the HTML sub-scanner takes over.
   */
  public function main() {
    while (!$this->eos()) {
      $start = $this->pos();

      if ($this->scan('/\{([{%])/')) {
        // Grab these before any further scan() call.
        $opener = $this->match();
        $opener_type = $this->match_group(1);

        if (!$this->scan('/\s*comment\s*%\}/')) {
          // {{ ... }} or {% ... %}
          $this->record($opener, 'DELIMITER');
          $this->scan_python($opener_type === '{');
          if ($this->scan('/[}%]\}/')) {
            $this->record($this->match(), 'DELIMITER');
          }
        }
        else {
          // {% comment %} ... {% endcomment %}
          $comment = $opener . $this->match();
          $closer = '/\{%\s*endcomment\s*%\}/';
          if ($this->scan_until($closer) === null) {
            // unterminated: the comment runs to the end of the input
            $comment .= $this->rest();
            $this->terminate();
          }
          else {
            $comment .= $this->match();
            $comment .= $this->scan($closer);
          }
          $this->record($comment, 'COMMENT');
        }
      }
      elseif ($this->scan('/\{\# (?: [^\#]++ | \#(?! \} ) )*+ (?: \#\} | $)/x')) {
        // {# ... #}
        $this->record($this->match(), 'COMMENT');
      }
      else {
        $this->scan_html();
      }

      // every iteration must consume something
      assert($start < $this->pos());
    }
  }

  /**
   * Heuristic estimate of how likely $src is to be a Django template.
   *
   * @return float the HTML scanner's score plus 0.01 when that score is at
   *   least 0.2 and the source contains '{{' or '{%'; otherwise 0.0.
   */
  public static function guess_language($src, $info) {
    $html = LuminousHTMLScanner::guess_language($src, $info);
    if ($html >= 0.2) {
      $has_tag = strpos($src, '{{') !== false || strpos($src, '{%') !== false;
      if ($has_tag) {
        return $html + 0.01;
      }
    }
    return 0.0;
  }
}
|